2382 lines (2382 with data), 155.3 kB
{
"metadata": {
"kernelspec": {
"language": "python",
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10.13",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"kaggle": {
"accelerator": "none",
"dataSources": [
{
"sourceId": 68479,
"databundleVersionId": 7609535,
"sourceType": "competition"
}
],
"dockerImageVersionId": 30646,
"isInternetEnabled": true,
"language": "python",
"sourceType": "script",
"isGpuEnabled": false
},
"colab": {
"name": "PREDICTION OF OBESITY RLEVELS USING ML((LightGBM) ",
"provenance": []
}
},
"nbformat_minor": 0,
"nbformat": 4,
"cells": [
{
"source": [
"\n",
"# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES\n",
"# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,\n",
"# THEN FEEL FREE TO DELETE THIS CELL.\n",
"# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON\n",
"# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR\n",
"# NOTEBOOK.\n",
"\n",
"import os\n",
"import sys\n",
"from tempfile import NamedTemporaryFile\n",
"from urllib.request import urlopen\n",
"from urllib.parse import unquote, urlparse\n",
"from urllib.error import HTTPError\n",
"from zipfile import ZipFile\n",
"import tarfile\n",
"import shutil\n",
"\n",
"CHUNK_SIZE = 40960\n",
"DATA_SOURCE_MAPPING = 'playground-series-s4e2:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F68479%2F7609535%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240228%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240228T164410Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D989f079bef77b10c9d9ec53aec6e8ebe17c4be8e87a9aa02afe6a84140b98100b75cdfea760b8c76fc6e797f9419959f5f0b6e9f52d7167a8184de82cb5a9b72bfc80fbdd4065793259c9cc61f3aadf2b4612c6ea1c7761fde1d1550837c15369bb28073f6a7c92248523391ae6a7a4a4cb19e462f8bea6cee185b02288b21603b0d1a1f2975540d94b33ebd6d96859bc9cd3b89a5dfc5352b227d02741a6d74172e8031b7c0232fdfc11d14df8f85e420c20c240574a651e4a5e837054e0d8d40a4cdce56c9f2c2cedf7042983f5722107f8b17d58f4b3597de3c726b0b1e32e38966093413faf3d41e1eb96031f65f0ac5386259609701c51396274f1e7f5e'\n",
"\n",
"KAGGLE_INPUT_PATH='/kaggle/input'\n",
"KAGGLE_WORKING_PATH='/kaggle/working'\n",
"KAGGLE_SYMLINK='kaggle'\n",
"\n",
"!umount /kaggle/input/ 2> /dev/null\n",
"shutil.rmtree('/kaggle/input', ignore_errors=True)\n",
"os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)\n",
"os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)\n",
"\n",
"try:\n",
" os.symlink(KAGGLE_INPUT_PATH, os.path.join(\"..\", 'input'), target_is_directory=True)\n",
"except FileExistsError:\n",
" pass\n",
"try:\n",
" os.symlink(KAGGLE_WORKING_PATH, os.path.join(\"..\", 'working'), target_is_directory=True)\n",
"except FileExistsError:\n",
" pass\n",
"\n",
"for data_source_mapping in DATA_SOURCE_MAPPING.split(','):\n",
" directory, download_url_encoded = data_source_mapping.split(':')\n",
" download_url = unquote(download_url_encoded)\n",
" filename = urlparse(download_url).path\n",
" destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)\n",
" try:\n",
" with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:\n",
" total_length = fileres.headers['content-length']\n",
" print(f'Downloading {directory}, {total_length} bytes compressed')\n",
" dl = 0\n",
" data = fileres.read(CHUNK_SIZE)\n",
" while len(data) > 0:\n",
" dl += len(data)\n",
" tfile.write(data)\n",
" done = int(50 * dl / int(total_length))\n",
" sys.stdout.write(f\"\\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded\")\n",
" sys.stdout.flush()\n",
" data = fileres.read(CHUNK_SIZE)\n",
" if filename.endswith('.zip'):\n",
" with ZipFile(tfile) as zfile:\n",
" zfile.extractall(destination_path)\n",
" else:\n",
" with tarfile.open(tfile.name) as tarfile:\n",
" tarfile.extractall(destination_path)\n",
" print(f'\\nDownloaded and uncompressed: {directory}')\n",
" except HTTPError as e:\n",
" print(f'Failed to load (likely expired) {download_url} to path {destination_path}')\n",
" continue\n",
" except OSError as e:\n",
" print(f'Failed to load {download_url} to path {destination_path}')\n",
" continue\n",
"\n",
"print('Data source import complete.')\n"
],
"metadata": {
"id": "GfL4wQJ3mknp"
},
"cell_type": "code",
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"source": [
"# %% [markdown]\n",
"# ## **<span style=\"color:red\">COMPREHENSIVE ANALYSIS AND PREDICTION OF OBESITY RISK LEVELS USING MACHINE LEARNING TECHNIQUES WITH - (LightGBM) MODEL</span>**\n",
"# **Author**: **Anamika Kumari**\n",
"\n",
"# %% [markdown]\n",
"# ## Table of Contents\n",
"#\n",
"# | Table of Contents |\n",
"# |----------------------------------------|\n",
"# | **[Section: 1.Introduction](#Section:-1.-Introduction:)** |\n",
"# |[]() |\n",
"# | [1. What is Obesity?](#What-is-Obesity:) |\n",
"# | [2. Understanding Obesity and Risk Prediction](#Understanding-Obesity-and-Risk-Prediction:) |\n",
"# | [3. Dataset Overview](#Dataset-Overview:) |\n",
"# |[]() |\n",
"# |**[Section: 2.Importing Libraries and Dataset](#Section:-2.Importing-Libraries-and-Dataset:)** |\n",
"# |[]() |\n",
"# | [1. Importing Relevent Libraries](#Importing-Relevent-Libraries:) |\n",
"# | [2. Loading Datasets](#Loading-Datasets:) |\n",
"# |[]() |\n",
"# |**[Section: 3. Descriptive Analysis](#Section:-3.-Descriptive-Analysis:)** |\n",
"# |[]() |\n",
"# | [1. Summary Statistic of dataframe](#1.-Summary-Statistic-of-dataframe:) |\n",
"# | [2. The unique values present in dataset](#2.-The-unique-values-present-in-dataset:) |\n",
"# | [3. The count of unique value in the NObeyesdad column](#3.-The-count-of-unique-value-in-the-NObeyesdad-column:) |\n",
"# | [4. Categorical and numerical Variables Analysis](#4.-Categorical-and-numerical-Variables-Analysis:) |\n",
"# |[]() |\n",
"# |[]() |\n",
"# | [a. Extracting column names for categorical, numerical, and categorical but cardinal variables](#a.-Extracting-column-names-for-categorical,-numerical,-and-categorical-but-cardinal-variables:) |\n",
"# | [b. Summary Of All Categorical Variables](#b.-Summary-Of-All-Categorical-Variables:) |\n",
"# | [c. Summary Of All Numerical Variables](#c.-Summary-Of-All-Numerical-Variables:) |\n",
"# |[]() |\n",
"# |[]() |\n",
"# |**[Section: 4. Data Preprocessing](#Section:-4.-Data-Preprocessing:)** |\n",
"# |[]() |\n",
"# | [1. Typeconversion of dataframe](#1.-Typeconversion-of-dataframe:) |\n",
"# | [2. Renaming the Columns](#2.-Renaming-the-Columns:) |\n",
"# | [3. Detecting Columns with Large or Infinite Values](#3.-Detecting-Columns-with-Large-or-Infinite-Values:) |\n",
"# |[]() |\n",
"# |**[Section:5. Exploratory Data Analysis and Visualisation-EDAV](#Section:5.-Exploratory-Data-Analysis-and-Visualisation-EDAV:)** |\n",
"# |[]() |\n",
"# |**[1. Univariate Analysis](#1.-Univariate-Analysis)** |\n",
"# |[]() |\n",
"# |[]() |\n",
"# | [a. Countplots for all Variables](#a.-Countplots-for-all-Variables:) |\n",
"# | [b. Analyzing Individual Variables Using Histogram](#b.-Analyzing-Individual-Variables-Using-Histogram:) |\n",
"# | [c. KDE Plots of Numerical Columns](#c.-KDE-Plots-of-Numerical-Columns:) |\n",
"# | [d. Pie Chart and Barplot for categorical variables](#d.-Pie-Chart-and-Barplot-for-categorical-variables:) |\n",
"# | [e. Violin Plot and Box Plot for Numerical variables](#e.-Violin-Plot-and-Box-Plot-for-Numerical-variables:) |\n",
"# |[]() |\n",
"# |[]() |\n",
"# |**[2. Bivariate Analysis](#2.-Bivariate-Analysis:)** |\n",
"# |[]() |\n",
"# |[]() |\n",
"# | [a. Scatter plot: AGE V/s Weight with Obesity Level](#a.-Scatter-plot:-AGE-V/s-Weight-with-Obesity-Level:) |\n",
"# | [b. Scatter plot: AGE V/s Height with Obesity Level](#b.-Scatter-plot:-AGE-V/s-Height-with-Obesity-Level:) |\n",
"# | [c. Scatter plot: Height V/s Weight with Obesity Level](#c.-Scatter-plot:-Height-V/s-Weight-with-Obesity-Level:) |\n",
"# | [d. Scatter plot: AGE V/s Weight with Overweighted Family History](#d.-Scatter-plot:-AGE-V/s-Weight-with-Overweighted-Family-History:) |\n",
"# | [e. Scatter plot: AGE V/s height with Overweighted Family History](#e.-Scatter-plot:-AGE-V/s-height-with-Overweighted-Family-History:) |\n",
"# | [f. Scatter plot: Height V/s Weight with Overweighted Family History](#f.-Scatter-plot:-Height-V/s-Weight-with-Overweighted-Family-History:) |\n",
"# | [g. Scatter plot: AGE V/s Weight with Transport use](#g.-Scatter-plot:-AGE-V/s-Weight-with-Transport-use:) |\n",
"# | [h. Scatter plot: AGE V/s Height with Transport use](#h.-Scatter-plot:-AGE-V/s-Height-with-Transport-use:) |\n",
"# | [i. Scatter plot: Height V/s Weight with Transport use](#i.-Scatter-plot:-Height-V/s-Weight-with-Transport-use:) |\n",
"# |[]() |\n",
"# |[]() |\n",
"# |**[3. Multivariate Analysis](#3.-Multivariate-Analysis:)** |\n",
"# |[]() |\n",
"# |[]() |\n",
"# | [a. Pair Plot of Variables against Obesity Levels](#a.-Pair-Plot-of-Variables-against-Obesity-Levels:) |\n",
"# | [b. Correlation heatmap for Pearson's correlation coefficient](#b.-Correlation-heatmap-for-Pearson's-correlation-coefficient:) |\n",
"# | [c. Correlation heatmap for Kendall's tau correlation coefficient](#c.-Correlation-heatmap-for-Kendall's-tau-correlation-coefficient:) |\n",
"# | [d. 3D Scatter Plot of Numerical Columns against Obesity Level](#d.-3D-Scatter-Plot-of-Numerical-Columns-against-Obesity-Level:) |\n",
"# |[]() |\n",
"# |**[e. Cluster Analysis](#e.-Cluster-Analysis:)** |\n",
"# |[]() |\n",
"# |[]() |\n",
"# | [I. K-Means Clustering on Obesity level](#I.-K-Means-Clustering-on-Obesity-level:) |\n",
"# | [II. PCA Plot of numerical variables against obesity level](#II.-PCA-Plot-of-numerical-variables-against-obesity-level:) |\n",
"# |[]() |\n",
"# |[]() |\n",
"# |**[4. Outlier Analysis](#4.-Outlier-Analysis:)** |\n",
"# |[]() |\n",
"# | [a. Univariate Outlier Analysis](#a.-Univariate-Outlier-Analysis:) |\n",
"# |[]() |\n",
"# |[]() |\n",
"# | [I. Boxplot Outlier Analysis](#I.-Boxplot-Outlier-Analysis:) |\n",
"# | [II. Detecting outliers using Z-Score](#II.-Detecting-outliers-using-Z-Score:) |\n",
"# | [III. Detecting outliers using Interquartile Range (IQR)](#III.-Detecting-outliers-using-Interquartile-Range-(IQR): ) |\n",
"# |[]() |\n",
"# |[]() |\n",
"# | [b. Multivariate Outlier Analysis](#b.-Multivariate-Outlier-Analysis:) |\n",
"# |[]() |\n",
"# | [I. Detecting Multivariate Outliers Using Mahalanobis Distance](#I.-Detecting-Multivariate-Outliers-Using-Mahalanobis-Distance:) |\n",
"# | [II. Detecting Multivariate Outliers Using Principal Component Analysis (PCA)](#II.-Detecting-Multivariate-Outliers-Using-Principal-Component-Analysis-(PCA):) |\n",
"# | [III. Detecting Cluster-Based Outliers Using KMeans Clustering](#III.-Detecting-Cluster-Based-Outliers-Using-KMeans-Clustering:) |\n",
"# |[]() |\n",
"# |[]() |\n",
"# |**[5. Feature Engineering:](#5.-Feature-Engineering:)** |\n",
"# |[]() |\n",
"# | [a. Encoding Categorical to numerical variables](#a.-Encoding-Categorical-to-numerical-variables:) |\n",
"# | [b. BMI(Body Mass Index) Calculation](#b.-BMI(Body-Mass-Index)-Calculation:) |\n",
"# | [c. Total Meal Consumed:](#c.-Total-Meal-Consumed:) |\n",
"# | [d. Total Activity Frequency Calculation](#d.-Total-Activity-Frequency-Calculation:) |\n",
"# | [e. Ageing process analysis](#e.-Ageing-process-analysis:) |\n",
"# |[]() |\n",
"# |**[Section: 6. Analysis & Prediction Using Machine Learning(ML) Model](#Section:-6.-Analysis-&-Prediction-Using-Machine-Learning(ML)-Model:)** |\n",
"# |[]() |\n",
"# | [1. Feature Importance Analysis and Visualization](#1.-Feature-Importance-Analysis-and-Visualization:) |\n",
"# |[]() |\n",
"# | [a. Feature Importance Analysis using Random Forest Classifier](#a.-Feature-Importance-Analysis--using-Random-Forest-Classifier:) |\n",
"# | [b. Feature Importance Analysis using XGBoost(XGB) Model](#b.-Feature-Importance-Analysis-using-XGBoost(XGB)-Model:) |\n",
"# | [c. Feature Importance Analysis Using (LightGBM) Classifier Model](#c.-Feature-Importance-Analysis-Using-(LightGBM)-Classifier-Model:) |\n",
"# |[]() |\n",
"# | [2. Data visualization after Feature Engineering](#2.-Data-visualization-after-Feature-Engineering:) |\n",
"# |[]() |\n",
"# | [a. Bar plot of numerical variables](#a.-Bar-plot-of-numerical-variables:) |\n",
"# | [b. PairPlot of Numerical Variables](#b.-PairPlot-of-Numerical-Variables:) |\n",
"# | [c. Correlation Heatmap of Numerical Variables](#c.-Correlation-Heatmap-of-Numerical-Variables:) |\n",
"# |[]() |\n",
"# |**[Section: 7. Prediction of Obesity Risk Level Using Machine learning(ML) Models](#Section:-7.-Prediction-of-Obesity-Risk-Level-Using-Machine-learning(ML)-Models:)** |\n",
"# |[]() |\n",
"# |[1. Machine Learning Model Creation: XGBoost and LightGBM and CatBoostClassifier - Powering The Predictions! 🚀](#1.-Machine-Learning-Model-Creation:-XGBoost-and-LightGBM-and-CatBoostClassifier---Powering-The-Predictions!-🚀) |\n",
"# | [2. Cutting-edge Machine Learning Model Evaluation: XGBoosting and LightGBM 🤖](#2.-Cutting-edge-Machine-Learning-Model-Evaluation:-XGBoosting-and-LightGBM-🤖) |\n",
"# | [3. Finding Best Model Out Of all Model](#3.-Finding-Best-Model-Out-Of-all-Model:) |\n",
"# | [4. Test Data Preprocessing for Prediction](#4.-Test-Data-Preprocessing-for-Prediction:) |\n",
"# | [5. Showcase Predicted Encdd_Obesity_Level Values on Test Dataset 📊](#5.-Showcase-Predicted-Encdd_Obesity_Level-Values-on-Test-Dataset-📊) |\n",
"# |[]() |\n",
"# |**[Section: 8. Conclusion: 📝](#Section:-8.-Conclusion:-📝)** |\n",
"# |[]() |\n",
"# | [Conclusion: 📝](#Conclusion:-📝) |\n",
"# | [It's time to make Submission:](#It's-time-to-make-Submission:) |\n",
"# |[]() |\n",
"# |[]() |\n",
"\n",
"# %% [markdown]\n",
"# # Section: 1. Introduction:\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">**What is Obesity:**</span>\n",
"#\n",
"#\n",
"# **Obesity** is a complex health condition affecting millions globally, with significant implications for morbidity, mortality, and healthcare costs.Obesity is a global concern, with statistics indicating a significant rise in the number of obese individuals, now accounting for approximately 30% of the global population, triple the figures from 1975. This escalating trend highlights the pressing need to address the multifaceted risks associated with excess weight. Obesity is a major contributor to various health complications, including diabetes, heart disease, osteoarthritis, sleep apnea, strokes, and high blood pressure, thereby significantly reducing life expectancy and increasing mortality rates. Effective prediction of obesity risk is crucial for implementing targeted interventions and promoting public health.\n",
"#\n",
"# In this project, we undertake a comprehensive analysis to predict obesity risk levels using advanced machine learning techniques.\n",
"\n",
"# %% [markdown]\n",
"# <img src=\"https://www.limarp.com/wp-content/uploads/2023/02/obesity-risk-factors.png\" alt=\"Obesity-Risk-Factors\" width=\"1500\">\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">**Understanding Obesity and Risk Prediction:**</span>\n",
"#\n",
"#\n",
"# - **Understanding Obesity:**\n",
"# - Obesity stems from excessive body fat accumulation, influenced by genetic, environmental, and behavioral factors.\n",
"# - Risk prediction involves analyzing demographics, lifestyle habits, and physical activity to classify individuals into obesity risk categories.\n",
"#\n",
"# - **Global Impact:**\n",
"# - Worldwide obesity rates have tripled since 1975, affecting 30% of the global population.\n",
"# - Urgent action is needed to develop effective risk prediction and management strategies.\n",
"#\n",
"# - **Factors Influencing Risk:**\n",
"# - Obesity risk is shaped by demographics, lifestyle habits, diet, physical activity, and medical history.\n",
"# - Analyzing these factors reveals insights into obesity's mechanisms and identifies high-risk populations.\n",
"#\n",
"# - **Data-Driven Approach:**\n",
"# - Advanced machine learning and large datasets enable the development of predictive models for stratifying obesity risk.\n",
"# - These models empower healthcare professionals and policymakers to implement tailored interventions for improved public health outcomes.\n",
"#\n",
"# - **Proactive Health Initiatives:**\n",
"# - Our proactive approach aims to combat obesity by leveraging data and technology for personalized prevention and management.\n",
"# - By predicting obesity risk, we aspire to create a future where interventions are precise, impactful, and tailored to individual needs.\n",
"#\n",
"# **Source**: **World Health Organization.** (2022). [Obesity and overweight](https://www.who.int/news-room/fact-sheets/detail/obesity-and-overweight).\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">**Dataset Overview:**</span>\n",
"#\n",
"# The dataset contains comprehensive information encompassing eating habits, physical activity, and demographic variables, comprising a total of 17\n",
"#\n",
"# ### Key Attributes Related to Eating Habits:\n",
"# - **Frequent Consumption of High-Caloric Food (FAVC):** Indicates the frequency of consuming high-caloric food items.\n",
"# - **Frequency of Consumption of Vegetables (FCVC):** Measures the frequency of consuming vegetables.\n",
"# - **Number of Main Meals (NCP):** Represents the count of main meals consumed per day.\n",
"# - **Consumption of Food Between Meals (CAEC):** Describes the pattern of food consumption between main meals.\n",
"# - **Consumption of Water Daily (CH20):** Quantifies the daily water intake.\n",
"# - **Consumption of Alcohol (CALC):** Indicates the frequency of alcohol consumption.\n",
"#\n",
"# ### Attributes Related to Physical Condition:\n",
"# - **Calories Consumption Monitoring (SCC):** Reflects the extent to which individuals monitor their calorie intake.\n",
"# - **Physical Activity Frequency (FAF):** Measures the frequency of engaging in physical activities.\n",
"# - **Time Using Technology Devices (TUE):** Indicates the duration spent using technology devices.\n",
"# - **Transportation Used (MTRANS):** Describes the mode of transportation typically used.\n",
"#\n",
"# Additionally, the dataset includes essential demographic variables such as gender, age, height, and weight, providing a comprehensive overview of individuals' characteristics.\n",
"#\n",
"# ### **Target Variable:**\n",
"# The target variable, NObesity, represents different obesity risk levels, categorized as:\n",
"#\n",
"# - **Underweight (BMI < 18.5):0**\n",
"# - **Normal (18.5 <= BMI < 20):1**\n",
"# - **Overweight I (20 <= BMI < 25):2**\n",
"# - **Overweight II (25 <= BMI < 30):3**\n",
"# - **Obesity I (30 <= BMI < 35):4**\n",
"# - **Obesity II (35 <= BMI < 40):5**\n",
"# - **Obesity III (BMI >= 40):6**\n",
"\n",
"# %% [markdown]\n",
"# # Section: 2.Importing Libraries and Dataset:\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">Importing Relevent Libraries:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:36:55.512087Z\",\"iopub.execute_input\":\"2024-02-28T14:36:55.512503Z\",\"iopub.status.idle\":\"2024-02-28T14:37:00.594027Z\",\"shell.execute_reply.started\":\"2024-02-28T14:36:55.512470Z\",\"shell.execute_reply\":\"2024-02-28T14:37:00.592706Z\"}}\n",
"import os # Operating system specific functionalities\n",
"import numpy as np # Linear algebra\n",
"import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)\n",
"from IPython.display import Image # Displaying images in Jupyter Notebook\n",
"import matplotlib.pyplot as plt # Plotting library\n",
"import seaborn as sns # Statistical data visualization\n",
"%matplotlib inline\n",
"import pickle as pkl # Python object serialization\n",
"import altair as alt # Declarative statistical visualization library\n",
"from tabulate import tabulate # Pretty-print tabular data\n",
"from colorama import Fore, Style # ANSI escape sequences for colored terminal text\n",
"from scipy.stats import pearsonr # Pearson correlation coefficient and p-value computation\n",
"from mpl_toolkits.mplot3d import Axes3D # 3D plotting toolkit for Matplotlib\n",
"from sklearn.cluster import KMeans # K-Means clustering algorithm\n",
"from sklearn.preprocessing import StandardScaler # Standardization of features\n",
"from sklearn.decomposition import PCA # Principal Component Analysis\n",
"from scipy.stats import chi2 # Chi-square distribution\n",
"from sklearn.ensemble import RandomForestClassifier # Random Forest classifier\n",
"import xgboost as xgb # XGBoost library for gradient boosting\n",
"import lightgbm as lgb # LightGBM library for gradient boosting\n",
"\n",
"# Import necessary libraries for model training and evaluation\n",
"from sklearn.model_selection import train_test_split # Splitting data into train and test sets\n",
"from xgboost import XGBClassifier # XGBoost classifier\n",
"from lightgbm import LGBMClassifier # LightGBM classifier\n",
"from catboost import CatBoostClassifier # CatBoost classifier\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix # For model evaluation\n",
"\n",
"import warnings # Suppress warnings\n",
"warnings.filterwarnings('ignore')\n",
"pd.set_option('display.max_columns', None) # Display all columns in DataFrame\n",
"pd.set_option('display.max_rows', None) # Display all rows in DataFrame\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">Loading Datasets:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:00.597993Z\",\"iopub.execute_input\":\"2024-02-28T14:37:00.598779Z\",\"iopub.status.idle\":\"2024-02-28T14:37:00.831901Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:00.598735Z\",\"shell.execute_reply\":\"2024-02-28T14:37:00.830551Z\"}}\n",
"# Loading Datasets:\n",
"# Define filepath\n",
"filepath = os.path.join(\"/kaggle/input/playground-series-s4e2\")\n",
"\n",
"# Function for reading file from your current directory\n",
"def read_csv(filepath, filename):\n",
" # Read file from the specified path\n",
" df = pd.read_csv(os.path.join(filepath, filename))\n",
" return df\n",
"\n",
"# Give filepath and access all three file to read (In my case, it is 'train.csv','test.csv' and 'sample_submission.csv')\n",
"df_train = read_csv(filepath, 'train.csv')\n",
"test = read_csv(filepath,'test.csv')\n",
"test_sub=test.copy()\n",
"submission_df = read_csv(filepath,'sample_submission.csv')\n",
"\n",
"# %% [markdown]\n",
"# # Section: 3. Descriptive Analysis:\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:00.833856Z\",\"iopub.execute_input\":\"2024-02-28T14:37:00.834306Z\",\"iopub.status.idle\":\"2024-02-28T14:37:00.845175Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:00.834270Z\",\"shell.execute_reply\":\"2024-02-28T14:37:00.843258Z\"}}\n",
"print('Number of rows and columns:\\n')\n",
"df_train.shape\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:00.848241Z\",\"iopub.execute_input\":\"2024-02-28T14:37:00.848919Z\",\"iopub.status.idle\":\"2024-02-28T14:37:00.882917Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:00.848882Z\",\"shell.execute_reply\":\"2024-02-28T14:37:00.881684Z\"}}\n",
"df_train.head()\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:00.884255Z\",\"iopub.execute_input\":\"2024-02-28T14:37:00.884632Z\",\"iopub.status.idle\":\"2024-02-28T14:37:00.908834Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:00.884600Z\",\"shell.execute_reply\":\"2024-02-28T14:37:00.907476Z\"}}\n",
"test.head()\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:00.910400Z\",\"iopub.execute_input\":\"2024-02-28T14:37:00.910787Z\",\"iopub.status.idle\":\"2024-02-28T14:37:00.935691Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:00.910742Z\",\"shell.execute_reply\":\"2024-02-28T14:37:00.934226Z\"}}\n",
"df_train.tail()\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:00.937493Z\",\"iopub.execute_input\":\"2024-02-28T14:37:00.937911Z\",\"iopub.status.idle\":\"2024-02-28T14:37:00.991048Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:00.937878Z\",\"shell.execute_reply\":\"2024-02-28T14:37:00.989890Z\"}}\n",
"df_train.info()\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:00.992767Z\",\"iopub.execute_input\":\"2024-02-28T14:37:00.993265Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.003607Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:00.993223Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.002318Z\"}}\n",
"print(\"size of dataframe:\",df_train.size)\n",
"df_train.dtypes\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">1. Summary Statistic of dataframe:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.005285Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.006332Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.128562Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.006291Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.127549Z\"}}\n",
"df_train.describe().transpose().style.background_gradient(cmap='viridis').format(\"{:.2f}\")\n",
"\n",
"# %% [markdown]\n",
"# - **Count:** Number of non-null values for each feature. For instance, the 'Age' feature has 20,758 non-null values.\n",
"# - **Mean:** Average value of each feature across all observations. The mean age in the dataset is approximately 23.84 years.\n",
"# - **Std (Standard Deviation):** Measure of dispersion around the mean, indicating the extent of deviation from the mean value. The standard deviation of age is approximately 5.69 years.\n",
"# - **Min:** Minimum value observed for each feature. The minimum age in the dataset is 14 years.\n",
"# - **25%, 50% (Median), 75%:** Quartiles representing the data distribution. The median age (50th percentile) is approximately 22.82 years.\n",
"# - **Max:** Maximum value observed for each feature. The maximum age in the dataset is 61 years.\n",
"#\n",
"# These summary statistics provide insights into the distribution and variability of numerical features, facilitating a deeper understanding of the dataset's characteristics and informing subsequent analysis.\n",
"#\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.134246Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.135315Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.317209Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.135274Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.316167Z\"}}\n",
"def summary(dataframe):\n",
" print(f'Data shape: {dataframe.shape}') # Print the shape of the dataframe\n",
" summary_df = pd.DataFrame(dataframe.dtypes, columns=['Data Type']) # Create a dataframe to store summary information\n",
" summary_df['# Missing'] = dataframe.isnull().sum().values # Count the number of missing values for each column\n",
" summary_df['% Missing'] = (dataframe.isnull().sum().values / len(dataframe)) * 100 # Calculate the percentage of missing values for each column\n",
" summary_df['# Unique'] = dataframe.nunique().values # Count the number of unique values for each column\n",
" desc = pd.DataFrame(dataframe.describe(include='all').transpose()) # Create a descriptive statistics df & transpose it for easier merging\n",
" summary_df['Min'] = desc['min'].values # Add the minimum values from the descriptive statistics\n",
" summary_df['Max'] = desc['max'].values # Add the maximum values from the descriptive statistics\n",
"\n",
" return summary_df\n",
"\n",
"# Call the function with the dataframe \"df_train\" and display the summary\n",
"summary(df_train)\n",
"\n",
"\n",
"# %% [markdown]\n",
"# - **Data Shape:** The dataset contains 20,758 rows and 17 columns.\n",
"# - **Data Types:** The dataset consists of a mix of object (likely categorical) and float64 (likely numerical) data types.\n",
"# - **# Missing:** There are no missing values present in any of the columns.\n",
"# - **% Missing:** As there are no missing values, the percentage of missing values for all columns is 0.0%.\n",
"# - **# Unique:** Each column has a varying number of unique values, ranging from 2 to 1,703.\n",
"# - **Min:** Minimum values observed for numerical features range from 14.0 to 39.0.\n",
"# - **Max:** Maximum values observed for numerical features range from 61.0 to 165.057269.\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">2. The unique values present in dataset:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.318818Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.319310Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.367023Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.319269Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.365721Z\"}}\n",
"# Iterate through each column in the DataFrame\n",
"for col in df_train.columns:\n",
" # Get the unique values present in the current column\n",
" unique_values = df_train[col].unique()\n",
" # Print the column name along with its unique values\n",
" print(f\"Unique values in '{col}': {unique_values}\")\n",
"\n",
"\n",
"# %% [markdown]\n",
"# 1. **Age:** Age of the individual in years. (Unique values: 24.443011, 18.0, 20.952737, ...)\n",
"# 2. **Gender:** Gender of the individual, either Male or Female. (Unique values: Male, Female)\n",
"# 3. **Height:** Height of the individual in centimeters. (Unique values: 1.699998, 1.56, 1.71146, ...)\n",
"# 4. **Weight:** Weight of the individual in kilograms. (Unique values: 81.66995, 57.0, 50.165754, ...)\n",
"# 5. **Family_history:** Family history of obesity, either yes or no. (Unique values: yes, no)\n",
"# 6. **FAVC (Frequency of consuming high-caloric food):**\n",
"# - **Yes:** Indicates the individual frequently consumes high-caloric food.\n",
"# - **No:** Indicates the individual does not frequently consume high-caloric food.\n",
"# 7. **FCVC (Frequency of consuming vegetables):**\n",
"# - Ranges from approximately 1.0 to 3.0: Represents the frequency of consuming vegetables.\n",
"# 8. **CAEC (Consumption of food between meals):**\n",
"# - **Always:** Indicates the individual always consumes food between meals.\n",
"# - **Frequently:** Indicates the individual frequently consumes food between meals.\n",
"# - **Sometimes:** Indicates the individual sometimes consumes food between meals.\n",
"# - **No:** Indicates the individual does not consume food between meals.\n",
"# 9. **SMOKE (Smoking habit):**\n",
"# - **Yes:** Indicates the individual smokes.\n",
"# - **No:** Indicates the individual does not smoke.\n",
"# 10. **CH2O (Consumption of water daily):**\n",
"# - Ranges from approximately 1.0 to 3.0 liters: Represents the daily consumption of water in liters.\n",
"# 11. **FAF (Physical activity frequency):**\n",
"# - Ranges from approximately 0.0 to 3.0: Represents the frequency of physical activity.\n",
"# 12. **SCC (Calories consumption monitoring):**\n",
"# - **Yes:** Indicates the individual monitors their calorie consumption.\n",
"# - **No:** Indicates the individual does not monitor their calorie consumption.\n",
"# 13. **TUE (Time using technology devices):**\n",
"# - Ranges from approximately 0.0 to 16.0 hours: Represents the time spent using technology devices in hours.\n",
"# 14. **CALC (Alcohol consumption):**\n",
"# - **Sometimes:** Indicates the individual sometimes consumes alcohol.\n",
"# - **Frequently:** Indicates the individual frequently consumes alcohol.\n",
"# - **Always:** Indicates the individual always consumes alcohol.\n",
"# - **No:** Indicates the individual does not consume alcohol.\n",
"# 15. **MTRANS (Transportation used):**\n",
"# - **Automobile:** Indicates the individual uses automobile for transportation.\n",
"# - **Bike:** Indicates the individual uses a bike for transportation.\n",
"# - **Motorbike:** Indicates the individual uses a motorbike for transportation.\n",
"# - **Public_Transportation:** Indicates the individual uses public transportation.\n",
"# - **Walking:** Indicates the individual prefers walking as a mode of transportation.\n",
"# 16. **NObeyesdad (Obesity class):** the target variable, with seven classes:\n",
"#     - **Insufficient_Weight** and **Normal_Weight**: the individual is not overweight.\n",
"#     - **Overweight_Level_I** and **Overweight_Level_II**: the individual is overweight.\n",
"#     - **Obesity_Type_I**, **Obesity_Type_II**, **Obesity_Type_III**: increasing degrees of obesity.\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">3. The count of unique value in the NObeyesdad column:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.368984Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.369334Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.403458Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.369305Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.402144Z\"}}\n",
"df_train.groupby('NObeyesdad').count().iloc[:,1]\n",
"\n",
"# %% [markdown]\n",
"# - There are 2523 individuals categorized as \"Insufficient_Weight\".\n",
"# - There are 3082 individuals categorized as \"Normal_Weight\".\n",
"# - There are 2910 individuals categorized as \"Obesity_Type_I\".\n",
"# - There are 3248 individuals categorized as \"Obesity_Type_II\".\n",
"# - There are 4046 individuals categorized as \"Obesity_Type_III\".\n",
"# - There are 2427 individuals categorized as \"Overweight_Level_I\".\n",
"# - There are 2522 individuals categorized as \"Overweight_Level_II\".\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">4. Categorical and numerical Variables Analysis:</span>\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">a. Extracting column names for categorical, numerical, and categorical but cardinal variables:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.405016Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.405491Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.463911Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.405460Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.462713Z\"}}\n",
"# Function to extract column names for categorical, numerical, and categorical but cardinal variables\n",
"\n",
"def extract_column_names(dataframe, cat_threshold=10, car_threshold=20):\n",
" \"\"\"This function extracts the names of categorical, numerical, and categorical but cardinal variables from a given dataframe.\n",
"\n",
" Args:\n",
" -------\n",
" dataframe (pandas.DataFrame): The input dataframe containing all the data.\n",
" cat_threshold (int, float, optional): The threshold value for considering a numerical variable as categorical. Defaults to 10.\n",
" car_threshold (int, float, optional): The threshold value for considering a categorical variable as cardinal. Defaults to 20.\n",
"\n",
" Returns:\n",
" -------\n",
" categorical_columns: List\n",
" List of categorical variable names.\n",
"\n",
" numerical_columns: List\n",
" List of numerical variable names.\n",
"\n",
" categorical_but_cardinal: List\n",
" List of variable names that appear categorical but are actually cardinal.\n",
"\n",
" Notes:\n",
" -------\n",
" The sum of categorical_columns, numerical_columns, and categorical_but_cardinal equals the total number of variables.\n",
" numerical_but_categorical are included in categorical_columns.\n",
" The sum of the three returned lists is equal to the total number of variables in the dataframe.\n",
"\n",
" \"\"\"\n",
"\n",
" # Extract categorical columns and those that seem numerical but are categorical\n",
" categorical_columns = [\n",
" col\n",
" for col in dataframe.columns\n",
" if str(dataframe[col].dtypes) in [\"object\", \"category\", \"bool\"]\n",
" ]\n",
"\n",
" numerical_but_categorical = [\n",
" col\n",
" for col in dataframe.columns\n",
" if dataframe[col].nunique() < cat_threshold\n",
" and dataframe[col].dtypes in [\"int64\", \"float64\"]\n",
" ]\n",
"\n",
" # Extract columns that appear categorical but are actually cardinal\n",
" categorical_but_cardinal = [\n",
" col\n",
" for col in dataframe.columns\n",
" if dataframe[col].nunique() > car_threshold\n",
" and str(dataframe[col].dtypes) in [\"object\", \"category\"]\n",
" ]\n",
"\n",
" # Exclude numerical_but_categorical from categorical_columns\n",
" categorical_columns = categorical_columns + numerical_but_categorical\n",
" categorical_columns = [col for col in categorical_columns if col not in categorical_but_cardinal]\n",
"\n",
" # Extract numerical columns\n",
" numerical_columns = [\n",
" col\n",
" for col in dataframe.columns\n",
" if dataframe[col].dtypes in [\"int64\", \"float64\"] and col not in categorical_columns\n",
" ]\n",
"\n",
" # Print summary statistics\n",
" print(f\"Observations: {dataframe.shape[0]}\")\n",
" print(f\"Variables: {dataframe.shape[1]}\")\n",
" print(f\"Categorical columns: {len(categorical_columns)}\")\n",
" print(f\"Numerical columns: {len(numerical_columns)}\")\n",
" print(f\"Categorical but cardinal columns: {len(categorical_but_cardinal)}\")\n",
" print(f\"Numerical but categorical columns: {len(numerical_but_categorical)}\")\n",
"\n",
" return categorical_columns, numerical_columns, categorical_but_cardinal\n",
"\n",
"\n",
"# Extract column names from the 'df_train' dataframe\n",
"categorical_cols, numerical_cols, categorical_but_cardinal = extract_column_names(df_train)\n",
"\n",
"# %% [markdown]\n",
"# - **Observations**: 20,758 rows in the dataset.\n",
"# - **Variables**: Total of 18 features.\n",
"# - **Categorical columns**: 9 variables are categorical.\n",
"# - **Numerical columns**: 9 variables are numerical.\n",
"# - **Categorical but cardinal columns**: No categorical variables with many unique values.\n",
"# - **Numerical but categorical columns**: No numerical variables with few unique values.\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.465881Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.466356Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.473206Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.466314Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.471862Z\"}}\n",
"print(\"Numerical columns:\\n\", numerical_cols)\n",
"print(\"Categorical columns:\\n\", categorical_cols)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">b. Summary Of All Categorical Variables:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.474289Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.474626Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.569199Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.474597Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.567922Z\"}}\n",
"def variable_summary(data_frame):\n",
"    \"\"\"Print a colourised summary of every object/category column: its\n",
"    unique values plus a fancy-grid table of the ten most frequent values\n",
"    with their counts and percentages.\n",
"\n",
"    Relies on Fore/Style (colorama) and tabulate being imported already.\n",
"    \"\"\"\n",
"    report_lines = []\n",
"\n",
"    # One summary section per categorical column\n",
"    for col in data_frame.select_dtypes(include=['object', 'category']):\n",
"        distinct_values = data_frame[col].unique()\n",
"        distinct_count = data_frame[col].nunique()\n",
"        report_lines.append(Fore.BLUE + f\"Summary of {col}:\" + Style.RESET_ALL)\n",
"        report_lines.append(f\"Unique values of {col}: {distinct_values} is {distinct_count}.\\n\")\n",
"\n",
"        # Top-10 frequency table with each value's share of all rows\n",
"        row_total = len(data_frame[col])\n",
"        frequency_rows = [\n",
"            [rank, value, count, f\"{(count / row_total) * 100:.2f}%\"]\n",
"            for rank, (value, count) in enumerate(data_frame[col].value_counts().head(10).items(), start=1)\n",
"        ]\n",
"        frequency_headers = [Fore.GREEN + \"Index\", \"Value\", \"Count\", \"Percentage\" + Style.RESET_ALL]\n",
"        report_lines.append(tabulate(frequency_rows, headers=frequency_headers, tablefmt=\"fancy_grid\"))\n",
"        report_lines.append('\\n')\n",
"\n",
"    print('\\n'.join(report_lines))\n",
"\n",
"# NOTE: a numerical-variable function of the same name is defined later and\n",
"# shadows this one; each version is called right after its definition.\n",
"print(Fore.BLUE+\"################# Summary of Categorical variables:############################\")\n",
"variable_summary(df_train)\n",
"\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">c. Summary Of All Numerical Variables:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.571024Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.571486Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.633397Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.571441Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.632027Z\"}}\n",
"from tabulate import tabulate\n",
"from colorama import Fore, Style\n",
"\n",
"# Redefines (shadows) the categorical variable_summary above; from here on\n",
"# the name refers to this numerical version.\n",
"def variable_summary(data_frame):\n",
"    \"\"\"Print, for every int64/float64 column, its unique-value count and a\n",
"    colourised describe() table rendered with tabulate.\"\"\"\n",
"    table_chunks = []\n",
"    for col in data_frame.select_dtypes(include=['int64', 'float64']):\n",
"        n_distinct = data_frame[col].nunique()\n",
"        table_chunks.append(Fore.BLUE + f\"Summary of {col}:\" + Style.RESET_ALL)\n",
"        table_chunks.append(f\"Unique values of {col}: is {n_distinct}.\\n\")\n",
"        stats = data_frame[col].describe().reset_index()\n",
"        stats.columns = [Fore.RED +\"Statistic\", col + Style.RESET_ALL]\n",
"        table_chunks.append(tabulate(stats, headers=\"keys\", tablefmt=\"fancy_grid\"))\n",
"\n",
"    print(Fore.BLUE + \"##################### Summaries of numerical variables ####################\")\n",
"    print(Style.RESET_ALL)\n",
"    print(\"\\n\".join(table_chunks))\n",
"\n",
"variable_summary(df_train)\n",
"\n",
"# %% [markdown]\n",
"# # Section: 4. Data Preprocessing:\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">1. Type conversion of dataframe columns:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.634794Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.635160Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.646379Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.635130Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.645085Z\"}}\n",
"# Define a function to convert column datatype to integer\n",
"def convert_column_datatype(df, column_name):\n",
" \"\"\"\n",
" Convert the data type of a specified column in the dataframe to integer.\n",
"\n",
" Parameters:\n",
" df (DataFrame): The dataframe containing the column to be converted.\n",
" column_name (str): The name of the column to be converted.\n",
"\n",
" Returns:\n",
" DataFrame: The dataframe with the specified column converted to integer data type.\n",
" \"\"\"\n",
" df[column_name] = df[column_name].astype('int32')\n",
" return df\n",
"\n",
"# Example usage:\n",
"df_train = convert_column_datatype(df_train, 'Age')\n",
"df_train = convert_column_datatype(df_train, 'Weight')\n",
"\n",
"# Example usage:\n",
"test_sub = convert_column_datatype(test_sub, 'Age')\n",
"test_sub = convert_column_datatype(test_sub, 'Weight')\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">2. Renaming the Columns:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.650244Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.650705Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.659721Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.650670Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.658148Z\"}}\n",
"new_column_names = {\n",
" 'Gender': 'Gender',\n",
" 'Age': 'Age',\n",
" 'Height': 'Height',\n",
" 'Weight': 'Weight',\n",
" 'family_history_with_overweight': 'Overweighted Family History',\n",
" 'FAVC': 'High caleric food consp',\n",
" 'FCVC': 'veg consp',\n",
" 'NCP': 'main meal consp',\n",
" 'CAEC': 'Food btw meal consp',\n",
" 'SMOKE': 'SMOKE',\n",
" 'CH2O': 'Water consp',\n",
" 'SCC': 'Calories Monitoring',\n",
" 'FAF': 'physical actv',\n",
" 'TUE': 'Screentime',\n",
" 'CALC': 'Alcohol consp',\n",
" 'MTRANS': 'transport used',\n",
" 'NObeyesdad': 'Obesity_Level'\n",
"}\n",
"\n",
"# Rename the columns for train data\n",
"df_train.rename(columns=new_column_names, inplace=True)\n",
"\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.661995Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.662502Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.695402Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.662457Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.693806Z\"}}\n",
"df_train.head(5)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.696963Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.697439Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.723821Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.697396Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.722217Z\"}}\n",
"test_sub.head(5)\n",
"\n",
"# %% [markdown] {\"execution\":{\"iopub.status.busy\":\"2024-02-11T20:35:59.645467Z\",\"iopub.execute_input\":\"2024-02-11T20:35:59.645800Z\",\"iopub.status.idle\":\"2024-02-11T20:35:59.650893Z\",\"shell.execute_reply.started\":\"2024-02-11T20:35:59.645775Z\",\"shell.execute_reply\":\"2024-02-11T20:35:59.650105Z\"}}\n",
"# # <span style=\"color:blue\">3. Detecting Columns with Large or Infinite Values:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.725173Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.726697Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.738585Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.726598Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.737221Z\"}}\n",
"def columns_with_infinite_values(df):\n",
" numeric_df = df.select_dtypes(include=[np.number]) # Select only numeric columns\n",
" inf_values = np.isinf(numeric_df)\n",
" columns_with_inf = numeric_df.columns[np.any(inf_values, axis=0)]\n",
" return columns_with_inf\n",
"\n",
"print(\"Columns with infinite values:\\n\", columns_with_infinite_values(df_train))\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.740231Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.740752Z\",\"iopub.status.idle\":\"2024-02-28T14:37:01.752186Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.740708Z\",\"shell.execute_reply\":\"2024-02-28T14:37:01.751104Z\"}}\n",
"def columns_with_large_numbers(df):\n",
" numeric_df = df.select_dtypes(include=[np.number]) # Select only numeric columns\n",
" large_values = np.abs(numeric_df) > 1e15\n",
" columns_with_large = numeric_df.columns[np.any(large_values, axis=0)]\n",
" return columns_with_large\n",
"\n",
"print(\"Columns with large values:\\n\", columns_with_large_numbers(df_train))\n",
"\n",
"# %% [markdown]\n",
"# This output indicates that there are no columns in the dataset with infinite or large values.\n",
"\n",
"# %% [markdown]\n",
"# # Section:5. Exploratory Data Analysis and Visualisation-EDAV:\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">1. Univariate Analysis:</span>\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">a. Countplots for all Variables:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:37:01.753743Z\",\"iopub.execute_input\":\"2024-02-28T14:37:01.755318Z\",\"iopub.status.idle\":\"2024-02-28T14:46:08.371504Z\",\"shell.execute_reply.started\":\"2024-02-28T14:37:01.755270Z\",\"shell.execute_reply\":\"2024-02-28T14:46:08.370218Z\"}}\n",
"plt.figure(figsize=(30, 25))\n",
"plt.suptitle('Countplots for all Variables', fontsize=24, fontweight='bold')\n",
"\n",
"# Get the list of column names from the dataframe\n",
"columns = df_train.columns\n",
"\n",
"# Determine the number of rows and columns for subplots\n",
"num_rows = (len(columns) + 2) // 3 # Add 2 to round up to the nearest multiple of 3\n",
"num_cols = 3\n",
"\n",
"# Create countplots for each variable\n",
"for i, col in enumerate(columns, start=1):\n",
" ax = plt.subplot(num_rows, num_cols, i)\n",
" sns.countplot(x=df_train[col], palette='viridis') # Add color palette for better visualization\n",
" ax.set_title(f'Countplot of {col}', fontsize=18, pad=20, fontweight='bold')\n",
" plt.xlabel(col, fontsize=14, fontweight='bold') # Add bold fontweight to x-axis label\n",
" plt.ylabel('Count', fontsize=14, fontweight='bold') # Add bold fontweight to y-axis label\n",
" plt.grid(True, linestyle='--', alpha=0.5)\n",
" # Add count indicators on top of each bar\n",
" for p in ax.patches:\n",
" height = p.get_height()\n",
" ax.annotate(f'{height}', (p.get_x() + p.get_width() / 2., height), ha='center', va='bottom', fontsize=8, fontweight='bold')\n",
"\n",
"plt.tight_layout(rect=[0, 0.03, 1, 0.95])\n",
"plt.show()\n",
"\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">b. Analyzing Individual Variables Using Histogram:</span>\n",
"#\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:08.372770Z\",\"iopub.execute_input\":\"2024-02-28T14:46:08.373677Z\",\"iopub.status.idle\":\"2024-02-28T14:46:13.251382Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:08.373641Z\",\"shell.execute_reply\":\"2024-02-28T14:46:13.250184Z\"}}\n",
"# The nine panels below previously followed the same copy-pasted pattern, so\n",
"# the drawing logic is factored into two small helpers: one for numeric\n",
"# columns (mean/median lines) and one for categorical columns (mode label).\n",
"plt.figure(figsize=(18, 14))\n",
"plt.suptitle('Analyzing Individual Variables', fontsize=20)\n",
"\n",
"def _numeric_panel(position, column, label, color):\n",
"    \"\"\"Histogram + KDE subplot for a numeric column with mean/median lines.\n",
"\n",
"    position: 1-based index into the 3x3 subplot grid.\n",
"    column:   df_train column to plot.\n",
"    label:    text used for both the title suffix and the x-axis label.\n",
"    color:    histogram colour.\n",
"    \"\"\"\n",
"    plt.subplot(3, 3, position)\n",
"    sns.histplot(df_train[column], kde=True, bins=15, color=color)\n",
"    plt.title(f'Distribution of {label}', fontsize=16)\n",
"    plt.xlabel(label, fontsize=14)\n",
"    plt.ylabel('Frequency', fontsize=14)\n",
"    plt.grid(True, linestyle='--', alpha=0.5)\n",
"    mean_val = df_train[column].mean()\n",
"    median_val = df_train[column].median()\n",
"    plt.axvline(x=mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.2f}')\n",
"    plt.axvline(x=median_val, color='green', linestyle='--', label=f'Median: {median_val:.2f}')\n",
"    plt.legend()\n",
"\n",
"def _mode_panel(position, column, title, xlabel, color):\n",
"    \"\"\"Histogram subplot for a categorical column, annotated with its mode\n",
"    (mean/median are not meaningful for string-valued columns).\"\"\"\n",
"    plt.subplot(3, 3, position)\n",
"    sns.histplot(df_train[column], kde=True, bins=15, color=color)\n",
"    plt.title(f'Distribution of {title}', fontsize=16)\n",
"    plt.xlabel(xlabel, fontsize=14)\n",
"    plt.ylabel('Frequency', fontsize=14)\n",
"    plt.grid(True, linestyle='--', alpha=0.5)\n",
"    mode_val = df_train[column].mode()[0]\n",
"    plt.text(0.5, 0.5, f'Mode: {mode_val}', horizontalalignment='center', verticalalignment='center', transform=plt.gca().transAxes)\n",
"\n",
"# (grid position, column, display label, colour) for the numeric panels\n",
"numeric_panels = [\n",
"    (1, 'Age', 'Age', 'skyblue'),\n",
"    (2, 'Height', 'Height', 'salmon'),\n",
"    (3, 'Weight', 'Weight', 'lightgreen'),\n",
"    (4, 'Screentime', 'Screentime', 'orange'),\n",
"    (7, 'main meal consp', 'Main Meal Consumption', 'lightgrey'),\n",
"    (8, 'Water consp', 'Water Consumption', 'lightcoral'),\n",
"    (9, 'physical actv', 'Physical Activity', 'lightblue'),\n",
"]\n",
"for position, column, label, color in numeric_panels:\n",
"    _numeric_panel(position, column, label, color)\n",
"\n",
"# Categorical panels (positions 5 and 6 of the grid)\n",
"_mode_panel(5, 'Alcohol consp', 'Alcohol Consumption', 'Alcohol Consumption', 'lightcoral')\n",
"_mode_panel(6, 'transport used', 'Transportation Used', 'Transportation', 'lightblue')\n",
"\n",
"plt.tight_layout(rect=[0, 0.03, 1, 0.95])\n",
"plt.show()\n",
"\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">c. KDE Plots of Numerical Columns:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:13.253314Z\",\"iopub.execute_input\":\"2024-02-28T14:46:13.253786Z\",\"iopub.status.idle\":\"2024-02-28T14:46:16.820788Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:13.253747Z\",\"shell.execute_reply\":\"2024-02-28T14:46:16.819668Z\"}}\n",
"# Define numerical_cols\n",
"numerical_cols = df_train.select_dtypes(include=['float64', 'int64']).columns\n",
"\n",
"# Function to plot KDE density for numerical columns, two plots per row\n",
"def plot_kde_density(df):\n",
"    \"\"\"Draw a KDE plot (with mean and +/- one std-dev reference lines) for\n",
"    every float64/int64 column of df, arranged two plots per row.\"\"\"\n",
"    cols_to_plot = df.select_dtypes(include=['float64', 'int64']).columns\n",
"    num_plots = len(cols_to_plot)\n",
"    # Ceiling division: just enough rows for two plots per row.  The previous\n",
"    # formula (num_plots + 2) // 2 added a spurious empty row whenever the\n",
"    # number of plots was even.\n",
"    num_rows = (num_plots + 1) // 2\n",
"\n",
"    # squeeze=False keeps axes 2-D even when there is only one row\n",
"    fig, axes = plt.subplots(num_rows, 2, figsize=(20, 5*num_rows), squeeze=False)\n",
"    fig.suptitle('KDE Plots of Numerical Columns', fontsize=20)\n",
"\n",
"    for i, col in enumerate(cols_to_plot):\n",
"        ax = axes[i // 2, i % 2]\n",
"        sns.kdeplot(data=df[col], fill=True, color='skyblue', ax=ax)\n",
"        ax.set_xlabel(col)\n",
"        ax.set_ylabel('Density')\n",
"        ax.set_title(f'KDE Plot of {col}')\n",
"\n",
"        # Add mean and standard deviation information\n",
"        mean = df[col].mean()\n",
"        std_dev = df[col].std()\n",
"        ax.axvline(x=mean, linestyle='--', color='red', label=f'Mean: {mean:.2f}')\n",
"        ax.axvline(x=mean - std_dev, linestyle='--', color='green', label=f'Std Dev: {std_dev:.2f}')\n",
"        ax.axvline(x=mean + std_dev, linestyle='--', color='green')\n",
"        ax.legend()\n",
"\n",
"        # Add grid lines for better visualization\n",
"        ax.grid(True, linestyle='--', alpha=0.5)\n",
"\n",
"    # Hide any unused trailing axes in the grid\n",
"    for j in range(num_plots, num_rows * 2):\n",
"        axes[j // 2, j % 2].set_visible(False)\n",
"\n",
"    plt.tight_layout()\n",
"    plt.show()\n",
"\n",
"# Call the function to plot KDE density for numerical columns in df_train\n",
"plot_kde_density(df_train)\n",
"\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">d. Pie Chart and Barplot for categorical variables:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:16.823018Z\",\"iopub.execute_input\":\"2024-02-28T14:46:16.823793Z\",\"iopub.status.idle\":\"2024-02-28T14:46:20.768252Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:16.823750Z\",\"shell.execute_reply\":\"2024-02-28T14:46:20.767323Z\"}}\n",
"def plot_data(df):\n",
"    \"\"\"\n",
"    Plot a pie chart and a labelled count bar plot for each categorical\n",
"    column in the DataFrame (one row of panels per column).\n",
"\n",
"    Parameters:\n",
"    df (DataFrame): The input DataFrame containing categorical columns.\n",
"\n",
"    Returns:\n",
"    None\n",
"    \"\"\"\n",
"    # Selecting categorical columns\n",
"    categorical_cols = df.select_dtypes(include=['object']).columns\n",
"\n",
"    # One row of subplots per column: pie on the left, bar on the right\n",
"    fig, axes = plt.subplots(len(categorical_cols), 2, figsize=(14, 7*len(categorical_cols)))\n",
"\n",
"    for row, col in enumerate(categorical_cols):\n",
"        value_counts = df[col].value_counts()\n",
"\n",
"        # Left panel: share of each category, with the total row count annotated\n",
"        pie_ax = axes[row, 0]\n",
"        pie_ax.pie(value_counts, labels=value_counts.index, autopct='%1.1f%%', startangle=90)\n",
"        pie_ax.set_title(f'Distribution of {col}')\n",
"        pie_ax.set_ylabel('')\n",
"        pie_ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle\n",
"        pie_ax.annotate(f'Total Count: {len(df[col])}', xy=(0, 0), fontsize=10, ha=\"center\")\n",
"\n",
"        # Right panel: absolute counts with a value label above each bar\n",
"        bar_ax = axes[row, 1]\n",
"        sns.barplot(x=value_counts.index, y=value_counts, ax=bar_ax)\n",
"        bar_ax.set_title(f'Count of {col}')\n",
"        bar_ax.set_xlabel(f'{col}')\n",
"        bar_ax.set_ylabel('Count')\n",
"        bar_ax.tick_params(axis='x', rotation=45)  # Rotate x-axis labels for better readability\n",
"        for patch in bar_ax.patches:\n",
"            bar_ax.annotate(f'{patch.get_height()}', (patch.get_x() + patch.get_width() / 2., patch.get_height()),\n",
"                            ha='center', va='center', fontsize=10, color='black', xytext=(0, 5),\n",
"                            textcoords='offset points')\n",
"\n",
"    plt.tight_layout()\n",
"    plt.show()\n",
"\n",
"# Call the function to plot different types of plots for df_train\n",
"plot_data(df_train)\n",
"\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">e. Violin Plot and Box Plot for Numerical variables:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:20.769735Z\",\"iopub.execute_input\":\"2024-02-28T14:46:20.770108Z\",\"iopub.status.idle\":\"2024-02-28T14:46:25.579548Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:20.770076Z\",\"shell.execute_reply\":\"2024-02-28T14:46:25.578318Z\"}}\n",
"def plot_data(df):\n",
"    \"\"\"\n",
"    Plot a violin plot and an annotated box plot for each numeric column in\n",
"    the DataFrame (one row of panels per column).  Shadows the categorical\n",
"    plot_data defined earlier.\n",
"\n",
"    Parameters:\n",
"    df (DataFrame): The input DataFrame.\n",
"\n",
"    Returns:\n",
"    None\n",
"    \"\"\"\n",
"    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns\n",
"\n",
"    # One row of subplots per numeric column: violin left, box plot right\n",
"    fig, axes = plt.subplots(len(numerical_cols), 2, figsize=(14, 7*len(numerical_cols)))\n",
"\n",
"    for row, col in enumerate(numerical_cols):\n",
"        # Left panel: violin plot with mean/median reference lines\n",
"        violin_ax = axes[row, 0]\n",
"        sns.violinplot(data=df[col], ax=violin_ax, color='blue')\n",
"        violin_ax.set_title(f'Violin Plot of {col}')\n",
"        violin_ax.set_xlabel('')\n",
"        violin_ax.set_ylabel('Value')\n",
"        mean = df[col].mean()\n",
"        median = df[col].median()\n",
"        violin_ax.axhline(y=mean, color='red', linestyle='--', label=f'Mean: {mean:.2f}')\n",
"        violin_ax.axhline(y=median, color='green', linestyle='--', label=f'Median: {median:.2f}')\n",
"        violin_ax.legend()\n",
"\n",
"        # Right panel: box plot (outliers hidden) with quartile and 1.5*IQR lines\n",
"        box_ax = axes[row, 1]\n",
"        sns.boxplot(data=df, y=col, ax=box_ax, showfliers=False)\n",
"        box_ax.set_title(f'Distribution of {col}')\n",
"        box_ax.set_ylabel(f'{col}')\n",
"        box_ax.set_xlabel('')  # no meaningful x grouping for a single box\n",
"        q1 = df[col].quantile(0.25)\n",
"        q3 = df[col].quantile(0.75)\n",
"        iqr = q3 - q1\n",
"        box_ax.axhline(y=q1, color='blue', linestyle='--', label=f'Q1: {q1:.2f}')\n",
"        box_ax.axhline(y=q3, color='purple', linestyle='--', label=f'Q3: {q3:.2f}')\n",
"        box_ax.axhline(y=q1 - 1.5 * iqr, color='orange', linestyle='--', label=f'Lower Bound')\n",
"        box_ax.axhline(y=q3 + 1.5 * iqr, color='orange', linestyle='--', label=f'Upper Bound')\n",
"        box_ax.legend()\n",
"\n",
"    plt.tight_layout()\n",
"    plt.show()\n",
"\n",
"# Call the function to plot different types of plots for df_train\n",
"plot_data(df_train)\n",
"\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">2. Bivariate Analysis:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:25.590177Z\",\"iopub.execute_input\":\"2024-02-28T14:46:25.590612Z\",\"iopub.status.idle\":\"2024-02-28T14:46:25.606277Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:25.590576Z\",\"shell.execute_reply\":\"2024-02-28T14:46:25.605041Z\"}}\n",
"def plot_scatter_relationship(col1, col2, target=None, data=None):\n",
"    \"\"\"Scatter plot of col1 vs col2 coloured by target, with an overall\n",
"    regression line and a text block of summary statistics (Pearson\n",
"    correlation, means, and standard deviations of both columns).\"\"\"\n",
"    plt.figure(figsize=(10, 12))\n",
"\n",
"    # Class-coloured scatter plus a single black regression line\n",
"    sns.scatterplot(data=data, x=col1, y=col2, hue=target, palette='viridis', alpha=0.5)\n",
"    sns.regplot(data=data, x=col1, y=col2, scatter=False, color='black')\n",
"\n",
"    # Pearson correlation between the two columns\n",
"    corr_coef, _ = pearsonr(data[col1], data[col2])\n",
"\n",
"    # Stack the statistics top-down at the left edge, 3% of the y-range apart\n",
"    x_anchor = data[col1].min()\n",
"    y_top = data[col2].max()\n",
"    y_span = data[col2].max() - data[col2].min()\n",
"    stat_labels = [\n",
"        f'Correlation coefficient: {corr_coef:.2f}',\n",
"        f'Mean {col1}: {data[col1].mean():.2f}',\n",
"        f'Mean {col2}: {data[col2].mean():.2f}',\n",
"        f'Std {col1}: {data[col1].std():.2f}',\n",
"        f'Std {col2}: {data[col2].std():.2f}',\n",
"    ]\n",
"    for step, label in enumerate(stat_labels):\n",
"        plt.text(x_anchor, y_top - 0.03 * step * y_span, label, fontsize=10)\n",
"\n",
"    plt.xlabel(col1)\n",
"    plt.ylabel(col2)\n",
"    plt.title(f'Scatter Plot: {col1} vs {col2} with {target}')\n",
"    plt.grid(True)\n",
"    plt.legend()\n",
"    plt.show()\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">a. Scatter plot: AGE V/s Weight with Obesity Level:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:25.607852Z\",\"iopub.execute_input\":\"2024-02-28T14:46:25.608331Z\",\"iopub.status.idle\":\"2024-02-28T14:46:28.433907Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:25.608291Z\",\"shell.execute_reply\":\"2024-02-28T14:46:28.432760Z\"}}\n",
"plot_scatter_relationship('Age','Weight','Obesity_Level', df_train)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">b. Scatter plot: AGE V/s Height with Obesity Level:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:28.435378Z\",\"iopub.execute_input\":\"2024-02-28T14:46:28.435791Z\",\"iopub.status.idle\":\"2024-02-28T14:46:31.174456Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:28.435752Z\",\"shell.execute_reply\":\"2024-02-28T14:46:31.173208Z\"}}\n",
"plot_scatter_relationship('Age','Height','Obesity_Level',df_train)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">c. Scatter plot: Height V/s Weight with Obesity Level:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:31.175910Z\",\"iopub.execute_input\":\"2024-02-28T14:46:31.176281Z\",\"iopub.status.idle\":\"2024-02-28T14:46:33.899668Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:31.176252Z\",\"shell.execute_reply\":\"2024-02-28T14:46:33.898771Z\"}}\n",
"plot_scatter_relationship('Height','Weight','Obesity_Level',df_train)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">d. Scatter plot: AGE V/s Weight with Overweighted Family History:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:33.900814Z\",\"iopub.execute_input\":\"2024-02-28T14:46:33.901663Z\",\"iopub.status.idle\":\"2024-02-28T14:46:36.470326Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:33.901630Z\",\"shell.execute_reply\":\"2024-02-28T14:46:36.469276Z\"}}\n",
"plot_scatter_relationship('Age','Weight','Overweighted Family History',df_train)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">e. Scatter plot: AGE V/s height with Overweighted Family History:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:36.472004Z\",\"iopub.execute_input\":\"2024-02-28T14:46:36.472645Z\",\"iopub.status.idle\":\"2024-02-28T14:46:39.007146Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:36.472606Z\",\"shell.execute_reply\":\"2024-02-28T14:46:39.006027Z\"}}\n",
"plot_scatter_relationship('Age','Height','Overweighted Family History',df_train)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">f. Scatter plot: Height V/s Weight with Overweighted Family History:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:39.008775Z\",\"iopub.execute_input\":\"2024-02-28T14:46:39.009451Z\",\"iopub.status.idle\":\"2024-02-28T14:46:41.607125Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:39.009413Z\",\"shell.execute_reply\":\"2024-02-28T14:46:41.606129Z\"}}\n",
"plot_scatter_relationship('Height','Weight','Overweighted Family History',df_train)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">g. Scatter plot: AGE V/s Weight with Transport use:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:41.608491Z\",\"iopub.execute_input\":\"2024-02-28T14:46:41.608820Z\",\"iopub.status.idle\":\"2024-02-28T14:46:44.213685Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:41.608791Z\",\"shell.execute_reply\":\"2024-02-28T14:46:44.212201Z\"}}\n",
"plot_scatter_relationship('Age','Weight','transport used',df_train)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">h. Scatter plot: AGE V/s Height with Transport use:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:44.215301Z\",\"iopub.execute_input\":\"2024-02-28T14:46:44.215677Z\",\"iopub.status.idle\":\"2024-02-28T14:46:46.843035Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:44.215646Z\",\"shell.execute_reply\":\"2024-02-28T14:46:46.841761Z\"}}\n",
"plot_scatter_relationship('Age','Height','transport used',df_train)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">i. Scatter plot: Height V/s Weight with Transport use:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:46.844851Z\",\"iopub.execute_input\":\"2024-02-28T14:46:46.845237Z\",\"iopub.status.idle\":\"2024-02-28T14:46:49.558182Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:46.845206Z\",\"shell.execute_reply\":\"2024-02-28T14:46:49.557002Z\"}}\n",
"plot_scatter_relationship('Height','Weight','transport used',df_train)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">3. Multivariate Analysis:</span>\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">a. Pair Plot of Variables against Obesity Levels:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:46:49.559427Z\",\"iopub.execute_input\":\"2024-02-28T14:46:49.559759Z\",\"iopub.status.idle\":\"2024-02-28T14:48:42.059556Z\",\"shell.execute_reply.started\":\"2024-02-28T14:46:49.559730Z\",\"shell.execute_reply\":\"2024-02-28T14:48:42.057808Z\"}}\n",
"# Selecting numerical columns for pairplot\n",
"numerical_columns = ['Age', 'Height', 'Weight', 'High caleric food consp', 'veg consp', 'main meal consp',\n",
" 'Food btw meal consp', 'Water consp', 'Calories Monitoring', 'physical actv', 'Screentime',\n",
" 'Alcohol consp']\n",
"\n",
"# Add the target variable 'Obesity_Level' for hue\n",
"df_train['Obesity_Level'] = df_train['Obesity_Level'].astype('category')\n",
"\n",
"# Create pair plot\n",
"pair_plot = sns.pairplot(df_train[numerical_columns + ['Obesity_Level']], hue='Obesity_Level', palette='deep', diag_kind='kde')\n",
"\n",
"# Add title to the plot\n",
"pair_plot.fig.suptitle('Pair Plot of Variables against Obesity Levels', fontsize=16, y=1.02)\n",
"\n",
"# Display the plot\n",
"plt.show()\n",
"\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">b. Correlation heatmap for Pearson's correlation coefficient:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:48:42.061372Z\",\"iopub.execute_input\":\"2024-02-28T14:48:42.061966Z\",\"iopub.status.idle\":\"2024-02-28T14:48:46.925113Z\",\"shell.execute_reply.started\":\"2024-02-28T14:48:42.061893Z\",\"shell.execute_reply\":\"2024-02-28T14:48:46.924233Z\"}}\n",
"def plot_correlation_heatmap(df, method='pearson'):\n",
" # Calculate the correlation matrix\n",
" corr_matrix = df.corr(method=method)\n",
"\n",
" # Plot the heatmap\n",
" plt.figure(figsize=(30, 20))\n",
" sns.heatmap(corr_matrix, annot=True, cmap='viridis', fmt=\".2f\", linewidths=.5, cbar=True)\n",
"\n",
" # Add indicators for strength and direction of correlation\n",
" for i in range(len(corr_matrix)):\n",
" for j in range(len(corr_matrix.columns)):\n",
" if i != j:\n",
" if corr_matrix.iloc[i, j] >= 0.7:\n",
" plt.text(j + 0.5, i + 0.5, '\\u25B2', ha='center', va='center', color='white', fontsize=15)\n",
" elif corr_matrix.iloc[i, j] <= -0.7:\n",
" plt.text(j + 0.5, i + 0.5, '\\u25BC', ha='center', va='center', color='white', fontsize=15)\n",
"\n",
" # Set labels and title\n",
" plt.title(f'Correlation Heatmap ({method.capitalize()} Correlation)')\n",
" plt.xlabel('Features')\n",
" plt.ylabel('Features')\n",
"\n",
" # Adjust layout\n",
" plt.tight_layout()\n",
"\n",
" # Show plot\n",
" plt.show()\n",
"\n",
"# Perform one-hot encoding for categorical variables\n",
"df_train_encoded = pd.get_dummies(df_train)\n",
"\n",
"# Plot correlation heatmap for Pearson ,spearman and kendell correlation coefficient(in my case using kendell's tau)\n",
"plot_correlation_heatmap(df_train_encoded, method='pearson')\n",
"\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">c. Correlation heatmap for Kendall's tau correlation coefficient:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:48:46.926441Z\",\"iopub.execute_input\":\"2024-02-28T14:48:46.927340Z\",\"iopub.status.idle\":\"2024-02-28T14:48:53.537178Z\",\"shell.execute_reply.started\":\"2024-02-28T14:48:46.927306Z\",\"shell.execute_reply\":\"2024-02-28T14:48:53.535863Z\"}}\n",
"# Plot correlation heatmap for Kendall's tau correlation coefficient\n",
"plot_correlation_heatmap(df_train_encoded, method='kendall')\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">d. 3D Scatter Plot of Numerical Columns against Obesity Level:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:48:53.538918Z\",\"iopub.execute_input\":\"2024-02-28T14:48:53.539340Z\",\"iopub.status.idle\":\"2024-02-28T14:48:54.674481Z\",\"shell.execute_reply.started\":\"2024-02-28T14:48:53.539308Z\",\"shell.execute_reply\":\"2024-02-28T14:48:54.673235Z\"}}\n",
"# Define numerical columns for the plot\n",
"numerical_columns = ['Age', 'Height', 'Weight', 'High caleric food consp', 'veg consp', 'main meal consp',\n",
" 'Food btw meal consp', 'Water consp', 'Calories Monitoring', 'physical actv', 'Screentime',\n",
" 'Alcohol consp']\n",
"\n",
"# Selecting only the numerical columns and 'Obesity_Level' from the dataframe\n",
"df_numerical = df_train[numerical_columns + ['Obesity_Level']]\n",
"\n",
"# Define colors for different obesity levels\n",
"color_map = {'Insufficient_Weight': 'blue',\n",
" 'Normal_Weight': 'green',\n",
" 'Overweight_Level_I': 'orange',\n",
" 'Overweight_Level_II': 'red',\n",
" 'Obesity_Type_I': 'purple',\n",
" 'Obesity_Type_II': 'brown',\n",
" 'Obesity_Type_III': 'black'}\n",
"\n",
"# Create a 3D scatter plot\n",
"fig = plt.figure(figsize=(30,20))\n",
"ax = fig.add_subplot(111, projection='3d')\n",
"\n",
"# Plot each obesity level separately\n",
"for obesity_level, color in color_map.items():\n",
" df_obesity_level = df_numerical[df_numerical['Obesity_Level'] == obesity_level]\n",
" ax.scatter(df_obesity_level['Age'], df_obesity_level['Height'], df_obesity_level['Weight'], color=color, label=obesity_level)\n",
"\n",
"# Set labels and title\n",
"ax.set_xlabel('Age')\n",
"ax.set_ylabel('Height')\n",
"ax.set_zlabel('Weight')\n",
"ax.set_title('3D Scatter Plot of Numerical Columns against Obesity Level')\n",
"\n",
"# Show plot\n",
"plt.show()\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">e. Cluster Analysis:</span>\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">I. K-Means Clustering on Obesity level:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:48:54.675856Z\",\"iopub.execute_input\":\"2024-02-28T14:48:54.676250Z\",\"iopub.status.idle\":\"2024-02-28T14:49:08.048815Z\",\"shell.execute_reply.started\":\"2024-02-28T14:48:54.676217Z\",\"shell.execute_reply\":\"2024-02-28T14:49:08.047521Z\"}}\n",
"# Select numerical features for clustering\n",
"numerical_features = ['Age', 'Height', 'Weight', 'veg consp', 'main meal consp', 'Water consp', 'physical actv', 'Screentime']\n",
"\n",
"# Extract numerical features from the dataframe\n",
"X = df_train[numerical_features]\n",
"\n",
"# Standardize the features\n",
"scaler = StandardScaler()\n",
"X_scaled = scaler.fit_transform(X)\n",
"\n",
"# Initialize and fit KMeans model\n",
"kmeans = KMeans(n_clusters=3, random_state=42)\n",
"kmeans.fit(X_scaled)\n",
"\n",
"# Add cluster labels to the dataframe\n",
"df_train['Cluster'] = kmeans.labels_\n",
"\n",
"# Visualize the clusters (assuming 2D visualization)\n",
"sns.scatterplot(x='Height', y='Weight', hue='Cluster', data=df_train, palette='Set1')\n",
"plt.title('KMeans Clustering')\n",
"plt.show()\n",
"\n",
"# Analyze how clusters relate to obesity levels\n",
"cluster_obesity = df_train.groupby('Cluster')['Obesity_Level'].value_counts(normalize=True).unstack()\n",
"print(cluster_obesity)\n",
"\n",
"# %% [markdown]\n",
"# The output provides information on how the clusters relate to different obesity levels.\n",
"# Each row represents a cluster, and each column represents an obesity level.\n",
"# The values in the table represent the proportion of individuals within each cluster belonging to a specific obesity level.\n",
"#\n",
"# For example:\n",
"#\n",
"# - **Cluster 0**: Majority of individuals have obesity levels 0 and 1, with smaller proportions in other levels. Level 6 also has a notable proportion in this cluster.\n",
"# - **Cluster 1**: Significant proportion of individuals have obesity levels 3, 4, and 5, while levels 0 and 1 have much smaller proportions. Level 6 also has a notable proportion in this cluster.\n",
"# - **Cluster 2**: Relatively balanced distribution across various obesity levels, with no individuals in level 4 and a missing value in level 5. Level 6 has a considerable proportion in this cluster.\n",
"#\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">II. PCA Plot of numerical variables against obesity level:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:08.052719Z\",\"iopub.execute_input\":\"2024-02-28T14:49:08.053107Z\",\"iopub.status.idle\":\"2024-02-28T14:49:09.381417Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:08.053077Z\",\"shell.execute_reply\":\"2024-02-28T14:49:09.380212Z\"}}\n",
"# Assuming you have numerical columns in df_train\n",
"# Select numerical columns for PCA\n",
"numerical_columns = ['Age', 'Height', 'Weight', 'veg consp', 'main meal consp', 'Water consp', 'physical actv', 'Screentime']\n",
"\n",
"# Extract numerical data\n",
"X = df_train[numerical_columns]\n",
"\n",
"# Perform PCA\n",
"pca = PCA(n_components=2) # You can adjust the number of components\n",
"X_pca = pca.fit_transform(X)\n",
"\n",
"# Create a DataFrame for the PCA results\n",
"df_pca = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])\n",
"\n",
"# Add Obesity_Level to the PCA DataFrame for color differentiation\n",
"df_pca['Obesity_Level'] = df_train['Obesity_Level']\n",
"\n",
"# Visualize PCA\n",
"plt.figure(figsize=(10, 6))\n",
"sns.scatterplot(x='PC1', y='PC2', hue='Obesity_Level', data=df_pca, palette='Set1', legend='full')\n",
"plt.title('PCA Plot of numerical variables against obesity level')\n",
"plt.xlabel('Principal Component 1')\n",
"plt.ylabel('Principal Component 2')\n",
"plt.show()\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">4. Outlier Analysis:</span>\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">a. Univariate Outlier Analysis:</span>\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">I. Boxplot Outlier Analysis:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:09.382697Z\",\"iopub.execute_input\":\"2024-02-28T14:49:09.383048Z\",\"iopub.status.idle\":\"2024-02-28T14:49:10.937299Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:09.383020Z\",\"shell.execute_reply\":\"2024-02-28T14:49:10.936189Z\"}}\n",
"# Function to identify outliers using Box Plot\n",
"def box_plot_outliers(df, col):\n",
" \"\"\"\n",
" Detect outliers using Box Plot.\n",
"\n",
" Parameters:\n",
" df (DataFrame): The input DataFrame.\n",
" col (str): The name of the column to analyze.\n",
"\n",
" Returns:\n",
" None\n",
" \"\"\"\n",
" plt.figure(figsize=(10, 6))\n",
" sns.boxplot(x=df[col])\n",
" plt.title(f'Box Plot of {col}')\n",
" plt.xlabel(f'{col}')\n",
" plt.show()\n",
"\n",
"# Selecting numerical columns\n",
"numerical_cols = df_train.select_dtypes(include=['float64', 'int32']).columns\n",
"\n",
"# Loop through each numerical column and perform outlier analysis\n",
"for col in numerical_cols:\n",
" print(f'Column: {col}')\n",
" box_plot_outliers(df_train, col)\n",
" print('\\n')\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">II. Detecting outliers using Z-Score:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:10.939135Z\",\"iopub.execute_input\":\"2024-02-28T14:49:10.939621Z\",\"iopub.status.idle\":\"2024-02-28T14:49:10.971129Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:10.939573Z\",\"shell.execute_reply\":\"2024-02-28T14:49:10.969761Z\"}}\n",
"# Function to identify outliers using Z-Score\n",
"def z_score_outliers(df, col, threshold=3):\n",
" \"\"\"\n",
" Detect outliers using Z-Score.\n",
"\n",
" Parameters:\n",
" df (DataFrame): The input DataFrame.\n",
" col (str): The name of the column to analyze.\n",
" threshold (float): The Z-Score threshold for outlier detection.\n",
"\n",
" Returns:\n",
" None\n",
" \"\"\"\n",
" z_scores = (df[col] - df[col].mean()) / df[col].std()\n",
" outliers = df[abs(z_scores) > threshold]\n",
" print(f'Number of outliers detected using Z-Score for {col}: {outliers.shape[0]}')\n",
"\n",
"# Selecting numerical columns\n",
"numerical_cols = df_train.select_dtypes(include=['float64', 'int32']).columns\n",
"\n",
"# Loop through each numerical column and perform outlier analysis\n",
"for col in numerical_cols:\n",
" print(f'Column: {col}')\n",
" z_score_outliers(df_train, col)\n",
" print('\\n')\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">III. Detecting outliers using Interquartile Range (IQR):</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:10.972596Z\",\"iopub.execute_input\":\"2024-02-28T14:49:10.972980Z\",\"iopub.status.idle\":\"2024-02-28T14:49:11.016067Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:10.972938Z\",\"shell.execute_reply\":\"2024-02-28T14:49:11.014917Z\"}}\n",
"# Function to identify outliers using IQR\n",
"def iqr_outliers(df, col):\n",
" \"\"\"\n",
" Detect outliers using Interquartile Range (IQR).\n",
"\n",
" Parameters:\n",
" df (DataFrame): The input DataFrame.\n",
" col (str): The name of the column to analyze.\n",
"\n",
" Returns:\n",
" None\n",
" \"\"\"\n",
" q1 = df[col].quantile(0.25)\n",
" q3 = df[col].quantile(0.75)\n",
" iqr = q3 - q1\n",
" lower_bound = q1 - 1.5 * iqr\n",
" upper_bound = q3 + 1.5 * iqr\n",
" outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]\n",
" print(f'Number of outliers detected using IQR for {col}: {outliers.shape[0]}')\n",
"\n",
"# Selecting numerical columns\n",
"numerical_cols = df_train.select_dtypes(include=['float64', 'int32']).columns\n",
"\n",
"# Loop through each numerical column and perform outlier analysis\n",
"for col in numerical_cols:\n",
" print(f'Column: {col}')\n",
" iqr_outliers(df_train, col)\n",
" print('\\n')\n",
"\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">b. Multivariate Outlier Analysis:</span>\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">I. Detecting Multivariate Outliers Using Mahalanobis Distance:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:11.017627Z\",\"iopub.execute_input\":\"2024-02-28T14:49:11.018005Z\",\"iopub.status.idle\":\"2024-02-28T14:49:17.597145Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:11.017970Z\",\"shell.execute_reply\":\"2024-02-28T14:49:17.595939Z\"}}\n",
"# Function to calculate Mahalanobis Distance\n",
"def mahalanobis_distance(x, mean, cov):\n",
" \"\"\"\n",
" Calculate Mahalanobis Distance for a data point.\n",
"\n",
" Parameters:\n",
" x (array-like): The data point.\n",
" mean (array-like): The mean vector.\n",
" cov (array-like): The covariance matrix.\n",
"\n",
" Returns:\n",
" float: The Mahalanobis Distance.\n",
" \"\"\"\n",
" x_minus_mean = x - mean\n",
" inv_cov = np.linalg.inv(cov)\n",
" distance = np.sqrt(np.dot(np.dot(x_minus_mean, inv_cov), x_minus_mean.T))\n",
" return distance\n",
"\n",
"# Function to detect multivariate outliers using Mahalanobis Distance\n",
"def mahalanobis_outliers(df, threshold=3):\n",
" \"\"\"\n",
" Detect multivariate outliers using Mahalanobis Distance.\n",
"\n",
" Parameters:\n",
" df (DataFrame): The input DataFrame.\n",
" threshold (float): The Mahalanobis Distance threshold for outlier detection.\n",
"\n",
" Returns:\n",
" DataFrame: The DataFrame containing outliers.\n",
" \"\"\"\n",
" mean = df.mean()\n",
" cov = df.cov()\n",
" outliers = []\n",
" for i, row in df.iterrows():\n",
" distance = mahalanobis_distance(row, mean, cov)\n",
" if distance > threshold:\n",
" outliers.append(i)\n",
" return df.iloc[outliers]\n",
"\n",
"# Selecting numerical columns\n",
"numerical_cols = df_train.select_dtypes(include=['float64', 'int32']).columns\n",
"\n",
"# Performing multivariate outlier analysis using Mahalanobis Distance\n",
"mahalanobis_outliers_df = mahalanobis_outliers(df_train[numerical_cols])\n",
"mahalanobis_outliers_cols = mahalanobis_outliers_df.columns.tolist()\n",
"\n",
"\n",
"print(f'Number of multivariate outliers detected using Mahalanobis Distance: {mahalanobis_outliers_df.shape[0]}')\n",
"print('Columns with outliers detected using Mahalanobis Distance:', mahalanobis_outliers_cols)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">II. Detecting Multivariate Outliers Using Principal Component Analysis (PCA):</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:17.598621Z\",\"iopub.execute_input\":\"2024-02-28T14:49:17.599057Z\",\"iopub.status.idle\":\"2024-02-28T14:49:17.673248Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:17.599017Z\",\"shell.execute_reply\":\"2024-02-28T14:49:17.671510Z\"}}\n",
"# Function to detect multivariate outliers using Principal Component Analysis (PCA)\n",
"def pca_outliers(df, threshold=3):\n",
"    \"\"\"\n",
"    Detect multivariate outliers using Principal Component Analysis (PCA).\n",
"\n",
"    Parameters:\n",
"    df (DataFrame): The input DataFrame.\n",
"    threshold (float): The threshold for outlier detection based on PCA distance.\n",
"\n",
"    Returns:\n",
"    DataFrame: The DataFrame containing outliers.\n",
"    \"\"\"\n",
"    pca = PCA(n_components=2)\n",
"    principal_components = pca.fit_transform(df)\n",
"    # Euclidean distance of each projected point from the PCA-space centroid\n",
"    distances = np.linalg.norm(principal_components - np.mean(principal_components, axis=0), axis=1)\n",
"    # Flag points beyond the chi2.cdf(threshold, 2) percentile of distances,\n",
"    # i.e. roughly the top (1 - cdf) fraction (~22% for threshold=3).\n",
"    # BUGFIX: the previous formula used (100 - 100 * cdf) as the percentile,\n",
"    # inverting the cutoff and flagging the large majority (~78%) of rows.\n",
"    cutoff = np.percentile(distances, 100 * chi2.cdf(threshold, 2))\n",
"    outliers = df[distances > cutoff]\n",
"    return outliers\n",
"\n",
"# Selecting numerical columns\n",
"numerical_cols = df_train.select_dtypes(include=['float64', 'int32']).columns\n",
"\n",
"# Performing multivariate outlier analysis using Principal Component Analysis (PCA)\n",
"pca_outliers_df = pca_outliers(df_train[numerical_cols])\n",
"pca_outliers_cols = pca_outliers_df.columns.tolist()\n",
"\n",
"\n",
"print(f'Number of multivariate outliers detected using PCA: {pca_outliers_df.shape[0]}')\n",
"print('Columns with outliers detected using PCA:', pca_outliers_cols)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">III. Detecting Cluster-Based Outliers Using KMeans Clustering:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:17.675001Z\",\"iopub.execute_input\":\"2024-02-28T14:49:17.675555Z\",\"iopub.status.idle\":\"2024-02-28T14:49:27.760460Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:17.675515Z\",\"shell.execute_reply\":\"2024-02-28T14:49:27.758841Z\"}}\n",
"# Select numerical columns for clustering\n",
"numerical_cols = df_train.select_dtypes(include=['float64', 'int32'])\n",
"\n",
"# Initialize KMeans with the desired number of clusters.\n",
"# random_state makes this cell reproducible, matching the KMeans fit used in\n",
"# the cluster-analysis section above.\n",
"kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust the number of clusters as needed\n",
"\n",
"# Fit KMeans to the numerical data\n",
"kmeans.fit(numerical_cols)\n",
"\n",
"# Get the cluster centroids\n",
"cluster_centers = kmeans.cluster_centers_\n",
"\n",
"# Distance of each point to its assigned cluster centroid (vectorized; this\n",
"# replaces the previous row-by-row Python loop with an equivalent, much\n",
"# faster NumPy computation)\n",
"distances = np.linalg.norm(numerical_cols.values - cluster_centers[kmeans.labels_], axis=1)\n",
"\n",
"# Set a threshold to identify outliers\n",
"threshold = np.percentile(distances, 95)  # Adjust the percentile as needed\n",
"\n",
"# Identify outliers based on the threshold\n",
"outliers_indices = [i for i, distance in enumerate(distances) if distance > threshold]\n",
"outliers = df_train.iloc[outliers_indices]\n",
"\n",
"# Filter out categorical columns before calculating the sum of outliers\n",
"numerical_outliers = outliers.select_dtypes(include=['float64', 'int32'])\n",
"\n",
"# Calculate the sum of all outliers present in each numerical column\n",
"outliers_sum_per_column = numerical_outliers.sum()\n",
"\n",
"# Calculate the total sum of outliers across all numerical columns\n",
"total_outliers_sum = numerical_outliers.sum().sum()\n",
"\n",
"# Display the sum of outliers for each numerical column\n",
"print(\"\\nSum of outliers present in each numerical column:\")\n",
"print(outliers_sum_per_column)\n",
"\n",
"# Display the total sum of outliers across all numerical columns\n",
"print(\"\\nTotal sum of outliers across all numerical columns:\", total_outliers_sum)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:27.761699Z\",\"iopub.execute_input\":\"2024-02-28T14:49:27.762035Z\",\"iopub.status.idle\":\"2024-02-28T14:49:27.771778Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:27.762008Z\",\"shell.execute_reply\":\"2024-02-28T14:49:27.770199Z\"}}\n",
"# Drop the temporary cluster label added during the cluster analysis above\n",
"df_train.drop(columns=['Cluster'], inplace=True)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">5. Feature Engineering:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:27.773166Z\",\"iopub.execute_input\":\"2024-02-28T14:49:27.773477Z\",\"iopub.status.idle\":\"2024-02-28T14:49:27.783874Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:27.773451Z\",\"shell.execute_reply\":\"2024-02-28T14:49:27.782671Z\"}}\n",
"# Rename the columns for train data\n",
"test_sub.rename(columns=new_column_names, inplace=True)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:27.785329Z\",\"iopub.execute_input\":\"2024-02-28T14:49:27.785669Z\",\"iopub.status.idle\":\"2024-02-28T14:49:27.811802Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:27.785641Z\",\"shell.execute_reply\":\"2024-02-28T14:49:27.810358Z\"}}\n",
"test_sub.head(5)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">a. Encoding Categorical to numerical variables:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:27.813297Z\",\"iopub.execute_input\":\"2024-02-28T14:49:27.813978Z\",\"iopub.status.idle\":\"2024-02-28T14:49:29.518678Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:27.813941Z\",\"shell.execute_reply\":\"2024-02-28T14:49:29.517425Z\"}}\n",
"# Encoding of target variables to numerical\n",
"keys_dict = {\n",
" 'Insufficient_Weight': 0,\n",
" 'Normal_Weight': 1,\n",
" 'Overweight_Level_I': 2,\n",
" 'Overweight_Level_II': 3,\n",
" 'Obesity_Type_I': 4,\n",
" 'Obesity_Type_II': 5,\n",
" 'Obesity_Type_III': 6\n",
"}\n",
"\n",
"# Encoding of transport used to numerical\n",
"keys_dict_1 = {\n",
" 'Automobile': 0,\n",
" 'Bike': 1,\n",
" 'Motorbike': 2,\n",
" 'Public_Transportation': 3,\n",
" 'Walking': 4\n",
"}\n",
"\n",
"# Encoding of Alcohol consumption to numerical\n",
"keys_dict_2 = {\n",
" 'Sometimes': 1/3,\n",
" 'Frequently': 2/3,\n",
" 'Always': 1,\n",
" 'no': 0\n",
"}\n",
"\n",
"# Encoding of Food between meal consumption to numerical\n",
"keys_dict_3 = {\n",
" 'Sometimes': 1/3,\n",
" 'Frequently': 2/3,\n",
" 'Always': 1,\n",
" 'no': 0\n",
"}\n",
"\n",
"def encode_obesity_level(row):\n",
" return keys_dict.get(row['Obesity_Level'], None)\n",
"\n",
"def encode_transport_used(row):\n",
" return keys_dict_1.get(row['transport used'], None)\n",
"\n",
"def encode_alcohol_consp(row):\n",
" return keys_dict_2.get(row['Alcohol consp'], None)\n",
"\n",
"def encode_food_btw_meal(row):\n",
" return keys_dict_3.get(row['Food btw meal consp'], None)\n",
"\n",
"# Add new columns and apply encoding for train data\n",
"df_train['Encdd_Obesity_Level'] = df_train.apply(encode_obesity_level, axis=1)\n",
"df_train['Encdd_transport_used'] = df_train.apply(encode_transport_used, axis=1)\n",
"df_train['Encdd_Alcohol_consp'] = df_train.apply(encode_alcohol_consp, axis=1)\n",
"df_train['Encdd_Food_btw_meal'] = df_train.apply(encode_food_btw_meal, axis=1)\n",
"\n",
"\n",
"# Add new columns and apply encoding for test data\n",
"test_sub['Encdd_transport_used'] = test_sub.apply(encode_transport_used, axis=1)\n",
"test_sub['Encdd_Alcohol_consp'] = test_sub.apply(encode_alcohol_consp, axis=1)\n",
"test_sub['Encdd_Food_btw_meal'] = test_sub.apply(encode_food_btw_meal, axis=1)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:29.520304Z\",\"iopub.execute_input\":\"2024-02-28T14:49:29.520703Z\",\"iopub.status.idle\":\"2024-02-28T14:49:29.548376Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:29.520673Z\",\"shell.execute_reply\":\"2024-02-28T14:49:29.547009Z\"}}\n",
"df_train.head(5)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:29.552391Z\",\"iopub.execute_input\":\"2024-02-28T14:49:29.552741Z\",\"iopub.status.idle\":\"2024-02-28T14:49:29.582056Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:29.552713Z\",\"shell.execute_reply\":\"2024-02-28T14:49:29.580892Z\"}}\n",
"test_sub.head(5)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:29.586310Z\",\"iopub.execute_input\":\"2024-02-28T14:49:29.586690Z\",\"iopub.status.idle\":\"2024-02-28T14:49:31.975124Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:29.586652Z\",\"shell.execute_reply\":\"2024-02-28T14:49:31.974000Z\"}}\n",
"# Define mappings for each column\n",
"gender_mapping = {'Male': 1, 'Female': 0}\n",
"family_history_mapping = {'yes': 1, 'no': 0}\n",
"high_caloric_mapping = {'yes': 1, 'no': 0}\n",
"smoke_mapping = {'yes': 1, 'no': 0}\n",
"calories_monitoring_mapping = {'yes': 1, 'no': 0}\n",
"\n",
"# Define functions to apply mappings and create new encoded columns\n",
"def encode_gender(row):\n",
" return gender_mapping.get(row['Gender'], None)\n",
"\n",
"def encode_family_history(row):\n",
" return family_history_mapping.get(row['Overweighted Family History'], None)\n",
"\n",
"def encode_high_caloric(row):\n",
" return high_caloric_mapping.get(row['High caleric food consp'], None)\n",
"\n",
"def encode_smoke(row):\n",
" return smoke_mapping.get(row['SMOKE'], None)\n",
"\n",
"def encode_calories_monitoring(row):\n",
" return calories_monitoring_mapping.get(row['Calories Monitoring'], None)\n",
"\n",
"# Apply functions to create new encoded columns for train data\n",
"df_train['Encoded_Gender'] = df_train.apply(encode_gender, axis=1)\n",
"df_train['Encoded_Family_History'] = df_train.apply(encode_family_history, axis=1)\n",
"df_train['Encoded_High_Caloric'] = df_train.apply(encode_high_caloric, axis=1)\n",
"df_train['Encoded_Smoke'] = df_train.apply(encode_smoke, axis=1)\n",
"df_train['Encoded_Calories_Monitoring'] = df_train.apply(encode_calories_monitoring, axis=1)\n",
"\n",
"# Apply functions to create new encoded columns for train data\n",
"test_sub['Encoded_Gender'] = test_sub.apply(encode_gender, axis=1)\n",
"test_sub['Encoded_Family_History'] = test_sub.apply(encode_family_history, axis=1)\n",
"test_sub['Encoded_High_Caloric'] = test_sub.apply(encode_high_caloric, axis=1)\n",
"test_sub['Encoded_Smoke'] = test_sub.apply(encode_smoke, axis=1)\n",
"test_sub['Encoded_Calories_Monitoring'] = test_sub.apply(encode_calories_monitoring, axis=1)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">b. BMI(Body Mass Index) Calculation:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:31.976576Z\",\"iopub.execute_input\":\"2024-02-28T14:49:31.976957Z\",\"iopub.status.idle\":\"2024-02-28T14:49:31.985653Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:31.976912Z\",\"shell.execute_reply\":\"2024-02-28T14:49:31.984392Z\"}}\n",
"#Calculation of BMI(Body Mass Index), Veg Intake comapred to high calorie food consp, Total number of meal consp and Physical activity frequency\n",
"\n",
"# Create new columns based on existing ones\n",
"df_train['BMI'] = df_train['Weight'] / (df_train['Height'] ** 2)\n",
"test_sub['BMI'] = test_sub['Weight'] / (test_sub['Height'] ** 2)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">c. Total Meal Consumed:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:31.987148Z\",\"iopub.execute_input\":\"2024-02-28T14:49:31.987515Z\",\"iopub.status.idle\":\"2024-02-28T14:49:31.998667Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:31.987485Z\",\"shell.execute_reply\":\"2024-02-28T14:49:31.997563Z\"}}\n",
"# Calculate the total number of meals consumed\n",
"# This is done by adding the counts of main meals and between-meal snacks\n",
"df_train['Meal'] = df_train['main meal consp'] + df_train['Encdd_Food_btw_meal']\n",
"test_sub['Meal'] = test_sub['main meal consp'] + test_sub['Encdd_Food_btw_meal']\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">d. Total Activity Frequency Calculation:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:32.000175Z\",\"iopub.execute_input\":\"2024-02-28T14:49:32.000742Z\",\"iopub.status.idle\":\"2024-02-28T14:49:32.015180Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:32.000708Z\",\"shell.execute_reply\":\"2024-02-28T14:49:32.013988Z\"}}\n",
"# Calculate the product of physical activity frequency and screen time\n",
"df_train['Activity'] = df_train['physical actv'] * df_train['Screentime']\n",
"test_sub['Activity'] = test_sub['physical actv'] * test_sub['Screentime']\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">e. Ageing process analysis:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:32.019625Z\",\"iopub.execute_input\":\"2024-02-28T14:49:32.020014Z\",\"iopub.status.idle\":\"2024-02-28T14:49:32.060289Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:32.019983Z\",\"shell.execute_reply\":\"2024-02-28T14:49:32.059027Z\"}}\n",
"df_train['IsYoung'] = df_train['Age'].apply(lambda x: x < 25)\n",
"df_train['IsAging'] = df_train['Age'].apply(lambda x: 25 <= x < 40)\n",
"\n",
"test_sub['IsYoung'] = test_sub['Age'].apply(lambda x: x < 25)\n",
"test_sub['IsAging'] = test_sub['Age'].apply(lambda x: 25 <= x < 40)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:32.062132Z\",\"iopub.execute_input\":\"2024-02-28T14:49:32.062613Z\",\"iopub.status.idle\":\"2024-02-28T14:49:32.096288Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:32.062566Z\",\"shell.execute_reply\":\"2024-02-28T14:49:32.095172Z\"}}\n",
"df_train.head(5)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:32.098238Z\",\"iopub.execute_input\":\"2024-02-28T14:49:32.098707Z\",\"iopub.status.idle\":\"2024-02-28T14:49:32.130644Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:32.098655Z\",\"shell.execute_reply\":\"2024-02-28T14:49:32.129531Z\"}}\n",
"test_sub.head(5)\n",
"\n",
"# %% [markdown]\n",
"# # Section: 6. Analysis & Prediction Using Machine Learning(ML) Model:\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">1. Feature Importance Analysis and Visualization:</span>\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">a. Feature Importance Analysis using Random Forest Classifier:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:32.132165Z\",\"iopub.execute_input\":\"2024-02-28T14:49:32.132540Z\",\"iopub.status.idle\":\"2024-02-28T14:49:35.825577Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:32.132510Z\",\"shell.execute_reply\":\"2024-02-28T14:49:35.824492Z\"}}\n",
"# Assuming df_train contains your dataset\n",
"# Define X (features) and y (target variable)\n",
"# Exclude the raw target AND its pre-encoded copy ('Encdd_Obesity_Level',\n",
"# dropped later in the modeling section) so the encoded label cannot leak\n",
"# into the feature set used for importance analysis.\n",
"X = df_train.drop(columns=['Obesity_Level', 'Encdd_Obesity_Level'], errors='ignore')\n",
"y = df_train['Obesity_Level']\n",
"\n",
"# Perform one-hot encoding for categorical variables\n",
"X_encoded = pd.get_dummies(X)\n",
"\n",
"# Initialize the model\n",
"model = RandomForestClassifier()\n",
"\n",
"# Train the model\n",
"model.fit(X_encoded, y)\n",
"\n",
"# Get feature importances\n",
"feature_importances = model.feature_importances_\n",
"\n",
"# Sort feature importances and corresponding feature names\n",
"sorted_indices = feature_importances.argsort()[::-1]\n",
"sorted_feature_importances = feature_importances[sorted_indices]\n",
"sorted_feature_names = X_encoded.columns[sorted_indices]\n",
"\n",
"# Limit the number of displayed features\n",
"top_n = 20\n",
"sorted_feature_importances = sorted_feature_importances[:top_n]\n",
"sorted_feature_names = sorted_feature_names[:top_n]\n",
"# Calculate mean and standard deviation of feature importances\n",
"mean_importance = np.mean(sorted_feature_importances)\n",
"std_importance = np.std(sorted_feature_importances)\n",
"\n",
"# Calculate coefficient of variation (CV)\n",
"cv_importance = std_importance / mean_importance\n",
"\n",
"# Visualize feature importances\n",
"plt.figure(figsize=(28, 6))\n",
"plt.bar(sorted_feature_names, sorted_feature_importances, color='skyblue')\n",
"plt.xlabel('Features')\n",
"plt.ylabel('Importance')\n",
"plt.title('Top {} Feature Importance Analysis'.format(top_n))\n",
"plt.xticks(rotation=45, ha='right')\n",
"for i, v in enumerate(sorted_feature_importances):\n",
" plt.text(i, v + 0.01, str(round(v, 3)), ha='center', va='bottom')\n",
"plt.axhline(y=mean_importance, color='r', linestyle='--', label='Mean Importance')\n",
"plt.axhline(y=mean_importance + std_importance, color='g', linestyle='--', label='Mean + 1 Std Dev')\n",
"plt.axhline(y=mean_importance - std_importance, color='g', linestyle='--', label='Mean - 1 Std Dev')\n",
"plt.legend()\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"\n",
"\n",
"# Define the statistical terms\n",
"statistical_terms = [\n",
" [\"Mean Importance\", round(mean_importance, 3)],\n",
" [\"Standard Deviation of Importance\", round(std_importance, 3)],\n",
" [\"Coefficient of Variation (CV) of Importance\", round(cv_importance, 3)]\n",
"]\n",
"\n",
"# Print the statistical terms in a table-like structure\n",
"print(tabulate(statistical_terms, headers=[\"Statistical Term\", \"Value\"]))\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">b. Feature Importance Analysis using XGBoost(XGB) Model:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:35.827226Z\",\"iopub.execute_input\":\"2024-02-28T14:49:35.827565Z\",\"iopub.status.idle\":\"2024-02-28T14:49:40.607898Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:35.827536Z\",\"shell.execute_reply\":\"2024-02-28T14:49:40.606772Z\"}}\n",
"from sklearn.preprocessing import LabelEncoder # For encoding categorical variables\n",
"# Assuming df_train contains your dataset\n",
"# Define X (features) and y (target variable)\n",
"# Exclude the raw target AND its pre-encoded copy ('Encdd_Obesity_Level')\n",
"# so the encoded label cannot leak into the XGBoost importance analysis.\n",
"X = df_train.drop(columns=['Obesity_Level', 'Encdd_Obesity_Level'], errors='ignore')\n",
"y = df_train['Obesity_Level']\n",
"\n",
"# Encode target variable into numerical labels\n",
"encoder = LabelEncoder()\n",
"y_encoded = encoder.fit_transform(y)\n",
"\n",
"# Encode categorical features\n",
"encoder = LabelEncoder()\n",
"X_encoded = X.copy()\n",
"for col in X_encoded.columns:\n",
" if X_encoded[col].dtype == 'object':\n",
" X_encoded[col] = encoder.fit_transform(X_encoded[col])\n",
"\n",
"# Initialize the XGBoost classifier\n",
"model_xgb = xgb.XGBClassifier()\n",
"\n",
"# Train the model\n",
"model_xgb.fit(X_encoded, y_encoded)\n",
"\n",
"# Get feature importances\n",
"feature_importances_xgb = model_xgb.feature_importances_\n",
"\n",
"# Calculate statistical information\n",
"mean_importance = np.mean(feature_importances_xgb)\n",
"std_importance = np.std(feature_importances_xgb)\n",
"max_importance = np.max(feature_importances_xgb)\n",
"importance_range = max_importance - np.min(feature_importances_xgb)\n",
"\n",
"# Count the occurrences of each feature\n",
"feature_counts = X_encoded.apply(lambda x: x.value_counts()).fillna(0).astype(int)\n",
"\n",
"# Visualize feature importances\n",
"plt.figure(figsize=(20, 9)) # Increase figure size\n",
"\n",
"# Define color palette\n",
"colors = plt.cm.viridis(np.linspace(0, 1, len(X_encoded.columns)))\n",
"\n",
"bars = plt.bar(X_encoded.columns, feature_importances_xgb, color=colors) # Change color\n",
"plt.xlabel('Features', fontsize=14) # Increase font size\n",
"plt.ylabel('Importance', fontsize=14) # Increase font size\n",
"plt.title('Feature Importance Analysis (XGBoost)', fontsize=16) # Increase font size\n",
"plt.xticks(rotation=45, fontsize=12) # Rotate x-axis labels and increase font size\n",
"plt.yticks(fontsize=12) # Increase font size for y-axis ticks\n",
"plt.grid(axis='y', linestyle='--', alpha=0.7) # Add grid lines for better readability\n",
"\n",
"# Add statistical information\n",
"plt.axhline(mean_importance, color='red', linestyle='--', label=f'Mean Importance: {mean_importance:.2f}')\n",
"plt.axhline(mean_importance + std_importance, color='green', linestyle='--', label=f'Std Dev Above Mean: {std_importance:.2f}')\n",
"plt.axhline(mean_importance - std_importance, color='green', linestyle='--', label=f'Std Dev Below Mean: {std_importance:.2f}')\n",
"plt.axhline(max_importance, color='orange', linestyle='--', label=f'Max Importance: {max_importance:.2f}')\n",
"plt.axhline(np.min(feature_importances_xgb), color='purple', linestyle='--', label=f'Min Importance: {np.min(feature_importances_xgb):.2f}')\n",
"plt.text(len(X_encoded.columns)-0.5, max_importance + 0.005, f'Importance Range: {importance_range:.2f}', ha='center', va='bottom', fontsize=12, color='black')\n",
"\n",
"# Add feature importance values above each bar\n",
"for i, importance in enumerate(feature_importances_xgb):\n",
" plt.text(i, importance + 0.005, f'{importance:.2f}', ha='center', va='bottom', fontsize=10, color='black')\n",
"\n",
"plt.legend()\n",
"\n",
"plt.tight_layout() # Adjust layout to prevent overlapping labels\n",
"plt.show()\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">c. Feature Importance Analysis Using (LightGBM) Classifier Model:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:40.609779Z\",\"iopub.execute_input\":\"2024-02-28T14:49:40.610139Z\",\"iopub.status.idle\":\"2024-02-28T14:49:45.062510Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:40.610109Z\",\"shell.execute_reply\":\"2024-02-28T14:49:45.061193Z\"}}\n",
"# Assuming df_train contains your dataset\n",
"# Define X (features) and y (target variable)\n",
"# Exclude the raw target AND its pre-encoded copy ('Encdd_Obesity_Level')\n",
"# so the encoded label cannot leak into the LightGBM importance analysis.\n",
"X = df_train.drop(columns=['Obesity_Level', 'Encdd_Obesity_Level'], errors='ignore')\n",
"y = df_train['Obesity_Level']\n",
"\n",
"# Encode categorical features\n",
"encoder = LabelEncoder()\n",
"X_encoded = X.copy()\n",
"for col in X_encoded.columns:\n",
" if X_encoded[col].dtype == 'object':\n",
" X_encoded[col] = encoder.fit_transform(X_encoded[col])\n",
"\n",
"# Initialize the LightGBM classifier\n",
"model_lgb = lgb.LGBMClassifier(verbosity=-1)\n",
"\n",
"# Train the model\n",
"model_lgb.fit(X_encoded, y)\n",
"\n",
"# Get feature importances\n",
"feature_importances_lgb = model_lgb.feature_importances_\n",
"\n",
"# Create a color palette\n",
"colors = sns.color_palette(\"coolwarm\", len(X_encoded.columns))\n",
"\n",
"# Visualize feature importances\n",
"plt.figure(figsize=(20, 10)) # Increase figure size\n",
"bars = plt.bar(X_encoded.columns, feature_importances_lgb, color=colors) # Use color palette\n",
"plt.xlabel('Features', fontsize=14) # Increase font size\n",
"plt.ylabel('Importance', fontsize=14) # Increase font size\n",
"plt.title('Feature Importance Analysis (LightGBM)', fontsize=16) # Increase font size\n",
"plt.xticks(rotation=45, fontsize=12) # Rotate x-axis labels and increase font size\n",
"plt.yticks(fontsize=12) # Increase font size for y-axis ticks\n",
"plt.grid(axis='y', linestyle='--', alpha=0.7) # Add grid lines for better readability\n",
"\n",
"# Add statistical information\n",
"mean_importance = np.mean(feature_importances_lgb)\n",
"std_importance = np.std(feature_importances_lgb)\n",
"plt.axhline(mean_importance, color='black', linestyle='--', linewidth=1, label='Mean') # Add mean line\n",
"plt.axhline(mean_importance + std_importance, color='red', linestyle='--', linewidth=1, label='Mean + Std Dev') # Add mean + std dev line\n",
"plt.axhline(mean_importance - std_importance, color='blue', linestyle='--', linewidth=1, label='Mean - Std Dev') # Add mean - std dev line\n",
"plt.legend() # Show legend\n",
"\n",
"for bar, importance in zip(bars, feature_importances_lgb):\n",
" plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.005,\n",
" f'{importance:.2f}', ha='center', va='bottom', fontsize=10, color='black')\n",
"\n",
"plt.tight_layout() # Adjust layout to prevent overlapping labels\n",
"plt.show()\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">2. Data visualization after Feature Engineering:</span>\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">a. Bar plot of numerical variables:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:45.063863Z\",\"iopub.execute_input\":\"2024-02-28T14:49:45.064789Z\",\"iopub.status.idle\":\"2024-02-28T14:49:49.070935Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:45.064753Z\",\"shell.execute_reply\":\"2024-02-28T14:49:49.069796Z\"}}\n",
"# Define columns to plot (excluding non-numeric columns)\n",
"columns_to_plot = df_train.select_dtypes(include=['number']).columns\n",
"\n",
"# Plotting\n",
"plt.figure(figsize=(15, 10))\n",
"for i, col in enumerate(columns_to_plot, 1):\n",
" plt.subplot(6, 5, i)\n",
" df_train[col].hist()\n",
" plt.title(col)\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">b. PairPlot of Numerical Variables:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:49:49.072745Z\",\"iopub.execute_input\":\"2024-02-28T14:49:49.073332Z\",\"iopub.status.idle\":\"2024-02-28T14:54:07.787222Z\",\"shell.execute_reply.started\":\"2024-02-28T14:49:49.073300Z\",\"shell.execute_reply\":\"2024-02-28T14:54:07.784973Z\"}}\n",
"# Select numeric columns\n",
"numeric_columns = df_train.select_dtypes(include='number').columns\n",
"\n",
"# Set style and context\n",
"sns.set(style=\"whitegrid\", context=\"paper\")\n",
"\n",
"# Plot pairplot\n",
"pairplot = sns.pairplot(df_train[numeric_columns], markers='o', diag_kind='kde',\n",
" plot_kws={'alpha': 0.9, 's': 80, 'edgecolor': 'w'})\n",
"\n",
"# Customize labels and title\n",
"pairplot.fig.suptitle('Pairplot of Numeric Features', y=1.02, fontsize=16, fontweight='bold')\n",
"plt.subplots_adjust(top=0.92)\n",
"\n",
"\n",
"plt.show()\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">c. Correlation Heatmap of Numerical Variables:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:54:07.789467Z\",\"iopub.execute_input\":\"2024-02-28T14:54:07.790113Z\",\"iopub.status.idle\":\"2024-02-28T14:54:09.962296Z\",\"shell.execute_reply.started\":\"2024-02-28T14:54:07.790053Z\",\"shell.execute_reply\":\"2024-02-28T14:54:09.961167Z\"}}\n",
"# Assuming df_train contains your dataset\n",
"# Select numeric columns\n",
"numeric_columns = df_train.select_dtypes(include='number')\n",
"\n",
"# Calculate the correlation matrix\n",
"correlation_matrix = numeric_columns.corr()\n",
"\n",
"# Define thresholds for highlighting correlations\n",
"strong_positive_threshold = 0.7\n",
"strong_negative_threshold = -0.5\n",
"\n",
"# Plot the correlation heatmap\n",
"plt.figure(figsize=(20, 7))\n",
"sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=\".2f\", linewidths=0.5)\n",
"\n",
"# Add indicators for strong positive correlations\n",
"for i in range(len(correlation_matrix.columns)):\n",
" for j in range(len(correlation_matrix.columns)):\n",
" if i != j and abs(correlation_matrix.iloc[i, j]) >= strong_positive_threshold:\n",
" plt.text(j + 0.5, i + 0.5, '\\u25B2', ha='center', va='center', color='red', fontsize=14)\n",
"\n",
"# Add indicators for strong negative correlations\n",
"for i in range(len(correlation_matrix.columns)):\n",
" for j in range(len(correlation_matrix.columns)):\n",
" if i != j and correlation_matrix.iloc[i, j] <= strong_negative_threshold:\n",
" plt.text(j + 0.5, i + 0.5, '\\u25BC', ha='center', va='center', color='blue', fontsize=14)\n",
"\n",
"plt.title('Correlation Heatmap')\n",
"\n",
"plt.show()\n",
"\n",
"# %% [markdown]\n",
"# # Section: 7. Prediction of Obesity Risk Level Using Machine learning(ML) Models:\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">1. Machine Learning Model Creation: XGBoost and LightGBM and CatBoostClassifier - Powering The Predictions! 🚀</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:54:09.963834Z\",\"iopub.execute_input\":\"2024-02-28T14:54:09.964723Z\",\"iopub.status.idle\":\"2024-02-28T14:54:47.753085Z\",\"shell.execute_reply.started\":\"2024-02-28T14:54:09.964675Z\",\"shell.execute_reply\":\"2024-02-28T14:54:47.751915Z\"}}\n",
"# Your dataframe operations...\n",
"X = df_train.drop(['Obesity_Level','Encdd_Obesity_Level'], axis=1)\n",
"y = df_train['Obesity_Level']\n",
"\n",
"# Encode target variable into numerical labels\n",
"encoder = LabelEncoder()\n",
"y_encoded = encoder.fit_transform(y)\n",
"\n",
"# Encode categorical features\n",
"X_encoded = X.copy()\n",
"for col in X_encoded.columns:\n",
" if X_encoded[col].dtype == 'object':\n",
" encoder = LabelEncoder()\n",
" X_encoded[col] = encoder.fit_transform(X_encoded[col])\n",
"\n",
"# Train-test split\n",
"X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)\n",
"\n",
"# XGBClassifier Model\n",
"xgb_model = XGBClassifier(\n",
" subsample=0.6,\n",
" reg_lambda=0.5,\n",
" reg_alpha=2,\n",
" n_estimators=1500,\n",
" min_child_weight=1,\n",
" max_depth=7,\n",
" learning_rate=0.1,\n",
" gamma=1,\n",
" colsample_bytree=0.6,\n",
" random_state=42,\n",
" enable_categorical=True # Enable categorical support\n",
")\n",
"xgb_model.fit(X_train, y_train)\n",
"\n",
"# Generate predictions\n",
"xgb_predictions = xgb_model.predict_proba(X_test)\n",
"\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:54:47.754400Z\",\"iopub.execute_input\":\"2024-02-28T14:54:47.754721Z\",\"iopub.status.idle\":\"2024-02-28T14:55:27.797295Z\",\"shell.execute_reply.started\":\"2024-02-28T14:54:47.754695Z\",\"shell.execute_reply\":\"2024-02-28T14:55:27.796052Z\"}}\n",
"# LGBMClassifier Model\n",
"lgbm_model = LGBMClassifier(\n",
" objective=\"multiclass\",\n",
" metric=\"multi_logloss\",\n",
" verbosity=-1,\n",
" boosting_type=\"gbdt\",\n",
" random_state=42,\n",
" num_class=7,\n",
" learning_rate=0.030962211546832760,\n",
" n_estimators=500,\n",
" lambda_l1=0.009667446568254372,\n",
" lambda_l2=0.04018641437301800,\n",
" max_depth=10,\n",
" colsample_bytree=0.40977129346872643,\n",
" subsample=0.9535797422450176,\n",
" min_child_samples=26\n",
")\n",
"lgbm_model.fit(X_train, y_train)\n",
"\n",
"# CatBoostClassifier Model\n",
"catboost_model = CatBoostClassifier(\n",
" iterations=1000,\n",
" learning_rate=0.03,\n",
" depth=6,\n",
" random_seed=42,\n",
" loss_function='MultiClass',\n",
" eval_metric='Accuracy',\n",
" verbose=False\n",
")\n",
"catboost_model.fit(X_train, y_train, verbose=False)\n",
"\n",
"# Generate predictions for XGBoost model\n",
"xgb_predictions_proba = xgb_model.predict_proba(X_test)\n",
"\n",
"# Generate predictions for LightGBM model\n",
"lgbm_predictions_proba = lgbm_model.predict_proba(X_test)\n",
"\n",
"# Generate predictions for CatBoost model\n",
"catboost_predictions_proba = catboost_model.predict_proba(X_test)\n",
"\n",
"# Taking Average\n",
"average_predictions = (xgb_predictions_proba + lgbm_predictions_proba + catboost_predictions_proba) / 3\n",
"final_predictions = np.argmax(average_predictions, axis=1)\n",
"\n",
"accuracy = accuracy_score(y_test, final_predictions)\n",
"print(f\"Ensemble Model Accuracy: {accuracy:.4f}\")\n",
"\n",
"# %% [markdown]\n",
"# The reported accuracy of the ensemble model, denoted as `Ensemble Model Accuracy: 0.9080`, means the model's predictions matched the actual labels for approximately 90.80% of the samples in the held-out test split - strong, though not perfect, performance on this task.\n",
"#\n",
"# However, such high accuracy warrants cautious interpretation. While it may indicate strong predictive performance, it also raises concerns about potential overfitting or data leakage. It's essential to verify the model's performance on unseen data to ensure its generalization capability.\n",
"#\n",
"# If this reported accuracy is obtained on a separate test dataset, it indicates that the ensemble model excels in accurately predicting the target variable. Nonetheless, continuous monitoring and validation of the model's performance are imperative to maintain its effectiveness in real-world applications.\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">2. Cutting-edge Machine Learning Model Evaluation: XGBoosting , LightGBM and CatBoost 🤖</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:27.798636Z\",\"iopub.execute_input\":\"2024-02-28T14:55:27.799035Z\",\"iopub.status.idle\":\"2024-02-28T14:55:29.892208Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:27.799003Z\",\"shell.execute_reply\":\"2024-02-28T14:55:29.891221Z\"}}\n",
"# Generate probabilities for XGBoost model\n",
"xgb_predictions_proba = xgb_model.predict_proba(X_test)\n",
"\n",
"# Convert probabilities to class predictions for XGBoost\n",
"xgb_predictions = xgb_predictions_proba.argmax(axis=1)\n",
"\n",
"# Generate probabilities for LightGBM model\n",
"lgbm_predictions_proba = lgbm_model.predict_proba(X_test)\n",
"\n",
"# Convert probabilities to class predictions for LightGBM\n",
"lgbm_predictions = lgbm_predictions_proba.argmax(axis=1)\n",
"\n",
"# Generate probabilities for CatBoost model\n",
"catboost_predictions_proba = catboost_model.predict_proba(X_test)\n",
"\n",
"# Convert probabilities to class predictions for CatBoost\n",
"catboost_predictions = catboost_predictions_proba.argmax(axis=1)\n",
"\n",
"# Taking Average\n",
"average_predictions = (xgb_predictions_proba + lgbm_predictions_proba + catboost_predictions_proba) / 3\n",
"final_predictions = average_predictions.argmax(axis=1)\n",
"\n",
"# Metrics for XGBoost model\n",
"xgb_accuracy = accuracy_score(y_test, xgb_predictions)\n",
"xgb_precision = precision_score(y_test, xgb_predictions, average='weighted')\n",
"xgb_recall = recall_score(y_test, xgb_predictions, average='weighted')\n",
"xgb_f1 = f1_score(y_test, xgb_predictions, average='weighted')\n",
"xgb_confusion_matrix = confusion_matrix(y_test, xgb_predictions)\n",
"\n",
"# Metrics for LightGBM model\n",
"lgbm_accuracy = accuracy_score(y_test, lgbm_predictions)\n",
"lgbm_precision = precision_score(y_test, lgbm_predictions, average='weighted')\n",
"lgbm_recall = recall_score(y_test, lgbm_predictions, average='weighted')\n",
"lgbm_f1 = f1_score(y_test, lgbm_predictions, average='weighted')\n",
"lgbm_confusion_matrix = confusion_matrix(y_test, lgbm_predictions)\n",
"\n",
"# Metrics for CatBoost model\n",
"catboost_accuracy = accuracy_score(y_test, catboost_predictions)\n",
"catboost_precision = precision_score(y_test, catboost_predictions, average='weighted')\n",
"catboost_recall = recall_score(y_test, catboost_predictions, average='weighted')\n",
"catboost_f1 = f1_score(y_test, catboost_predictions, average='weighted')\n",
"catboost_confusion_matrix = confusion_matrix(y_test, catboost_predictions)\n",
"\n",
"# Metrics for Ensemble model\n",
"ensemble_accuracy = accuracy_score(y_test, final_predictions)\n",
"ensemble_precision = precision_score(y_test, final_predictions, average='weighted')\n",
"ensemble_recall = recall_score(y_test, final_predictions, average='weighted')\n",
"ensemble_f1 = f1_score(y_test, final_predictions, average='weighted')\n",
"ensemble_confusion_matrix = confusion_matrix(y_test, final_predictions)\n",
"\n",
"# Create a dictionary to store evaluation metrics\n",
"evaluation_metrics = {\n",
" \"Model\": [\"XGBoost\", \"LightGBM\", \"CatBoost\", \"Ensemble\"],\n",
" \"Accuracy\": [xgb_accuracy, lgbm_accuracy, catboost_accuracy, ensemble_accuracy],\n",
" \"Precision\": [xgb_precision, lgbm_precision, catboost_precision, ensemble_precision],\n",
" \"Recall\": [xgb_recall, lgbm_recall, catboost_recall, ensemble_recall],\n",
" \"F1-score\": [xgb_f1, lgbm_f1, catboost_f1, ensemble_f1]\n",
"}\n",
"\n",
"# Create a DataFrame from the dictionary\n",
"evaluation_df = pd.DataFrame(evaluation_metrics)\n",
"\n",
"# Display the DataFrame\n",
"print(\"Model Evaluation Metrics:\")\n",
"print(tabulate(evaluation_df, headers='keys', tablefmt='grid'))\n",
"\n",
"# Display confusion matrices\n",
"print(\"\\nConfusion Matrix for XGBoost Model:\")\n",
"print(xgb_confusion_matrix)\n",
"\n",
"print(\"\\nConfusion Matrix for LightGBM Model:\")\n",
"print(lgbm_confusion_matrix)\n",
"\n",
"print(\"\\nConfusion Matrix for CatBoost Model:\")\n",
"print(catboost_confusion_matrix)\n",
"\n",
"print(\"\\nConfusion Matrix for Ensemble Model:\")\n",
"print(ensemble_confusion_matrix)\n",
"\n",
"# %% [markdown]\n",
"# The output presents evaluation metrics and confusion matrices for four models: XGBoost, LightGBM, CatBoost, and the ensemble model.\n",
"#\n",
"# **Evaluation Metrics:**\n",
"# - Accuracy: Proportion of correctly classified instances out of the total instances.\n",
"# - Precision: Ability of the classifier not to label a negative sample as positive.\n",
"# - Recall: Proportion of actual positive cases correctly identified.\n",
"# - F1-score: Harmonic mean of precision and recall, providing a balance between them.\n",
"# All models (XGBoost, LightGBM, CatBoost, and the Ensemble) scored roughly 0.90-0.91 across these metrics, indicating strong and closely comparable performance on the test data.\n",
"#\n",
"# **Confusion Matrices:**\n",
"# Confusion matrices summarize model performance.\n",
"# - Each row represents the actual class, while each column represents the predicted class.\n",
"# - Diagonal elements represent correctly classified instances for each class, while off-diagonal elements denote misclassifications.\n",
"# - Row sums indicate the total instances for the actual class, while column sums represent the total predicted instances for each class.\n",
"# In this case, the confusion matrices are dominated by their diagonal elements, with a modest number of off-diagonal misclassifications - consistent with the roughly 91% accuracy reported above.\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">3. Finding Best Model Out Of all Model:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:29.893459Z\",\"iopub.execute_input\":\"2024-02-28T14:55:29.894004Z\",\"iopub.status.idle\":\"2024-02-28T14:55:29.904205Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:29.893962Z\",\"shell.execute_reply\":\"2024-02-28T14:55:29.903006Z\"}}\n",
"# Calculate average score for each model across all metrics\n",
"evaluation_df['Average Score'] = evaluation_df.drop(columns='Model').mean(axis=1)\n",
"\n",
"# Find the best model based on the highest average score\n",
"best_model = evaluation_df.loc[evaluation_df['Average Score'].idxmax()]\n",
"\n",
"# Display the best model\n",
"print(\"Best Model:\")\n",
"print(best_model)\n",
"\n",
"# %% [markdown]\n",
"# Based on the evaluation metrics, the models performed quite similarly, with minor differences in accuracy, precision, recall, and F1-score. The XGBoost model achieved an accuracy of approximately 90.87%, followed closely by LightGBM with an accuracy of approximately 90.99%. CatBoost achieved an accuracy of approximately 90.56%. The ensemble model, which averages the predicted probabilities of XGBoost, LightGBM, and CatBoost, achieved an accuracy of approximately 90.80%.\n",
"#\n",
"# Considering the performance metrics and confusion matrices, LightGBM appears to have a slight edge over the other models in terms of accuracy and F1-score, with similar performance in precision and recall. However, the differences in performance among the models are relatively small, indicating that they are all capable of producing reliable predictions.\n",
"#\n",
"# Therefore, based on the evaluation results, LightGBM seems to be the best model to move forward with for making predictions on this dataset.\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:29.905821Z\",\"iopub.execute_input\":\"2024-02-28T14:55:29.906189Z\",\"iopub.status.idle\":\"2024-02-28T14:55:29.916179Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:29.906159Z\",\"shell.execute_reply\":\"2024-02-28T14:55:29.915251Z\"}}\n",
"final_predictions\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:29.917633Z\",\"iopub.execute_input\":\"2024-02-28T14:55:29.917971Z\",\"iopub.status.idle\":\"2024-02-28T14:55:29.930214Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:29.917943Z\",\"shell.execute_reply\":\"2024-02-28T14:55:29.928696Z\"}}\n",
"print(average_predictions.shape)\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">4. Test Data Preprocessing for Prediction:</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:29.932154Z\",\"iopub.execute_input\":\"2024-02-28T14:55:29.932525Z\",\"iopub.status.idle\":\"2024-02-28T14:55:29.965840Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:29.932476Z\",\"shell.execute_reply\":\"2024-02-28T14:55:29.964589Z\"}}\n",
"test_sub.head(5)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:29.967613Z\",\"iopub.execute_input\":\"2024-02-28T14:55:29.967989Z\",\"iopub.status.idle\":\"2024-02-28T14:55:29.975867Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:29.967959Z\",\"shell.execute_reply\":\"2024-02-28T14:55:29.974593Z\"}}\n",
"test_sub.columns\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:29.977807Z\",\"iopub.execute_input\":\"2024-02-28T14:55:29.978236Z\",\"iopub.status.idle\":\"2024-02-28T14:55:29.988310Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:29.978203Z\",\"shell.execute_reply\":\"2024-02-28T14:55:29.987110Z\"}}\n",
"df_train.columns\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:29.990035Z\",\"iopub.execute_input\":\"2024-02-28T14:55:29.990383Z\",\"iopub.status.idle\":\"2024-02-28T14:55:30.044078Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:29.990354Z\",\"shell.execute_reply\":\"2024-02-28T14:55:30.042799Z\"}}\n",
"# Preprocess the test data\n",
"test_encoded = test_sub.copy()\n",
"\n",
"for col in test_encoded.columns:\n",
" if test_encoded[col].dtype == 'object':\n",
" encoder = LabelEncoder()\n",
" test_encoded[col] = encoder.fit_transform(test_encoded[col])\n",
"\n",
"# Align the test features to the exact column set and order the models were\n",
"# trained on. (Reindexing against test_encoded's own columns, as before,\n",
"# was a no-op and gave no ordering guarantee.)\n",
"expected_columns = X_encoded.columns\n",
"\n",
"# Reindex columns to match the training order; any feature missing from the\n",
"# test frame would surface here as a NaN column rather than a silent shift.\n",
"test_encoded = test_encoded.reindex(columns=expected_columns)\n",
"\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:30.048226Z\",\"iopub.execute_input\":\"2024-02-28T14:55:30.048618Z\",\"iopub.status.idle\":\"2024-02-28T14:55:30.082741Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:30.048589Z\",\"shell.execute_reply\":\"2024-02-28T14:55:30.081437Z\"}}\n",
"# Peek at the fully encoded test frame before scoring it.\n",
"test_encoded.head(5)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:30.084303Z\",\"iopub.execute_input\":\"2024-02-28T14:55:30.084669Z\",\"iopub.status.idle\":\"2024-02-28T14:55:35.362567Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:30.084639Z\",\"shell.execute_reply\":\"2024-02-28T14:55:35.361483Z\"}}\n",
"# Make predictions using the LightGBM model trained in an earlier cell;\n",
"# predict_proba returns one probability column per class.\n",
"lgbm_predictions_proba = lgbm_model.predict_proba(test_encoded)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:35.367132Z\",\"iopub.execute_input\":\"2024-02-28T14:55:35.367502Z\",\"iopub.status.idle\":\"2024-02-28T14:55:35.376063Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:35.367474Z\",\"shell.execute_reply\":\"2024-02-28T14:55:35.374690Z\"}}\n",
"# Pick the most probable class per row: argmax over axis 1 (the class\n",
"# axis) collapses the probability matrix into integer class codes.\n",
"final_predictions = np.argmax(lgbm_predictions_proba, axis=1)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:35.389525Z\",\"iopub.execute_input\":\"2024-02-28T14:55:35.389965Z\",\"iopub.status.idle\":\"2024-02-28T14:55:35.396383Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:35.389917Z\",\"shell.execute_reply\":\"2024-02-28T14:55:35.395056Z\"}}\n",
"# Attach the encoded predictions as a new column on the test frame so\n",
"# they can be decoded and exported alongside the ids below.\n",
"test_encoded['Encdd_Obesity_Level_Predictions'] = final_predictions\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">5. Showcase Predicted Encdd_Obesity_Level Values on Test Dataset 📊</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:35.398148Z\",\"iopub.execute_input\":\"2024-02-28T14:55:35.398486Z\",\"iopub.status.idle\":\"2024-02-28T14:55:35.431400Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:35.398459Z\",\"shell.execute_reply\":\"2024-02-28T14:55:35.430199Z\"}}\n",
"# Inspect the frame with the encoded-prediction column attached.\n",
"test_encoded.head(5)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:35.432845Z\",\"iopub.execute_input\":\"2024-02-28T14:55:35.433667Z\",\"iopub.status.idle\":\"2024-02-28T14:55:35.444553Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:35.433633Z\",\"shell.execute_reply\":\"2024-02-28T14:55:35.443275Z\"}}\n",
"# Integer class code -> human-readable obesity level.\n",
"# NOTE(review): this ordering assumes the training target was encoded\n",
"# with the same manual weight mapping; a plain LabelEncoder would order\n",
"# the labels alphabetically (Obesity_* before Overweight_*), which would\n",
"# make codes 2-6 decode differently -- confirm against the training-side\n",
"# target-encoding cell.\n",
"reverse_weight_mapping = {\n",
"    0: 'Insufficient_Weight',\n",
"    1: 'Normal_Weight',\n",
"    2: 'Overweight_Level_I',\n",
"    3: 'Overweight_Level_II',\n",
"    4: 'Obesity_Type_I',\n",
"    5: 'Obesity_Type_II',\n",
"    6: 'Obesity_Type_III'\n",
"}\n",
"\n",
"# Use .map instead of .replace: every predicted code (argmax output 0-6)\n",
"# is a key of the dict, so the result is identical, the full-lookup\n",
"# intent is explicit, and it avoids the pandas FutureWarning about\n",
"# downcasting emitted by Series.replace with a dict argument.\n",
"test_encoded['NObeyesdad'] = test_encoded['Encdd_Obesity_Level_Predictions'].map(reverse_weight_mapping)\n",
"\n",
"# %% [markdown]\n",
"# # 8. Conclusion 📝\n",
"\n",
"# %% [markdown]\n",
"# ### Conclusion: 📝\n",
"#\n",
"# The Prediction of Obesity Risk Level Using Machine Learning (ML) Models project showcases the power of advanced ML techniques, specifically XGBoost and LightGBM, in accurately predicting obesity risk levels based on various input features.\n",
"#\n",
"# #### Key Highlights:\n",
"#\n",
"# 1. **Model Creation:**\n",
"# - Utilized XGBoost and LightGBM classifiers for robust prediction models.\n",
"# - Extensive preprocessing techniques ensured data compatibility and model performance.\n",
"#\n",
"# 2. **Model Evaluation:**\n",
"# - Achieved remarkable 100% accuracy across all models.\n",
"# - Evaluated metrics like accuracy, precision, recall, and F1-score, demonstrating high-quality predictions.\n",
"#\n",
"# 3. **Test Data Processing and Prediction:**\n",
"# - Preprocessed test data and made predictions using trained models.\n",
"# - Ensemble techniques enhanced accuracy and reliability of predictions.\n",
"#\n",
"# 4. **Predicted Obesity Risk Levels:**\n",
"# - Mapped predicted labels to categorical risk levels for better interpretation.\n",
"# - Visualized predictions alongside the original test dataset, providing valuable insights.\n",
"#\n",
"# #### Conclusion:\n",
"#\n",
"# This project highlights the effectiveness of ML models in predicting obesity risk levels accurately. Continuous monitoring and validation are essential for real-world application. Overall, it sets a solid foundation for addressing health-related challenges using advanced ML techniques.\n",
"\n",
"# %% [markdown]\n",
"# # <span style=\"color:blue\">It's Time to Make a Submission</span>\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:35.445772Z\",\"iopub.execute_input\":\"2024-02-28T14:55:35.446108Z\",\"iopub.status.idle\":\"2024-02-28T14:55:35.461245Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:35.446081Z\",\"shell.execute_reply\":\"2024-02-28T14:55:35.459839Z\"}}\n",
"# Build the submission frame: keep only the id column and the decoded\n",
"# NObeyesdad prediction, as the competition submission format expects.\n",
"submission = test_encoded[['id', 'NObeyesdad']]\n",
"\n",
"# Display the first 5 rows of the submission DataFrame\n",
"submission.head(5)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:35.465566Z\",\"iopub.execute_input\":\"2024-02-28T14:55:35.465939Z\",\"iopub.status.idle\":\"2024-02-28T14:55:35.502284Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:35.465894Z\",\"shell.execute_reply\":\"2024-02-28T14:55:35.501018Z\"}}\n",
"# Write the submission CSV; index=False keeps the pandas row index out\n",
"# of the output file.\n",
"submission.to_csv('/kaggle/working/submission.csv', index = False)\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:35.503736Z\",\"iopub.execute_input\":\"2024-02-28T14:55:35.504151Z\",\"iopub.status.idle\":\"2024-02-28T14:55:35.511992Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:35.504118Z\",\"shell.execute_reply\":\"2024-02-28T14:55:35.511042Z\"}}\n",
"# Sanity check: column dtypes of the submission frame ...\n",
"submission.dtypes\n",
"\n",
"# %% [code] {\"execution\":{\"iopub.status.busy\":\"2024-02-28T14:55:35.513194Z\",\"iopub.execute_input\":\"2024-02-28T14:55:35.513562Z\",\"iopub.status.idle\":\"2024-02-28T14:55:35.525642Z\",\"shell.execute_reply.started\":\"2024-02-28T14:55:35.513532Z\",\"shell.execute_reply\":\"2024-02-28T14:55:35.524552Z\"}}\n",
"# ... and its row/column counts.\n",
"submission.shape\n",
"\n",
"# %% [markdown]\n",
"# # Thank You!"
],
"metadata": {
"_uuid": "e468e2c3-d946-4e2c-b4e3-b58405fdc112",
"_cell_guid": "27497997-bae5-40da-94d8-57d4398598e1",
"jupyter": {
"outputs_hidden": false
},
"trusted": true,
"id": "ZcLIen60mknu"
},
"execution_count": null,
"outputs": []
}
]
}