# File: 03-Experiments/Utils/utils.py
1
import pandas as pd
2
from sklearn.model_selection import train_test_split
3
import matplotlib.pyplot as plt
4
import seaborn as sns
5
import numpy as np
6
from sklearn.preprocessing import MinMaxScaler
7
from sklearn.preprocessing import PolynomialFeatures
8
9
def _drop_id_and_dedupe(frame):
    """Drop the ``id`` column, remove duplicate rows and renumber the index."""
    return frame.drop(['id'], axis=1).drop_duplicates().reset_index(drop=True)


def load_data(path, *, test_size=0.35, val_size=0.20, random_state=42):
    """Read a CSV file and split it into train, validation and test frames.

    The data is first split into train/test, then the train part is split
    again into train/validation. Each resulting frame has its ``id`` column
    dropped, duplicate rows removed and its index reset.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        test_size: Share of all rows held out as the test set.
        val_size: Share of the remaining (non-test) rows held out for
            validation.
        random_state: Seed used for both splits.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of DataFrames.

    Raises:
        KeyError: If the CSV has no ``id`` column.
    """
    df = pd.read_csv(path)
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    train_df, val_df = train_test_split(
        train_df, test_size=val_size, random_state=random_state
    )
    # NOTE(review): de-duplication happens per split, after splitting, so the
    # same row may still appear in more than one split -- confirm this is
    # intended.
    return (
        _drop_id_and_dedupe(train_df),
        _drop_id_and_dedupe(val_df),
        _drop_id_and_dedupe(test_df),
    )
17
18
def encode_target(train):
    """Replace the string labels in ``NObeyesdad`` with integer codes 0-6.

    The column is overwritten in place on the frame that is passed in.
    Values that are not one of the seven known labels become NaN, because
    ``Series.map`` yields NaN for keys missing from the mapping.

    Args:
        train: DataFrame containing a ``NObeyesdad`` column.

    Returns:
        The same DataFrame object, with ``NObeyesdad`` encoded.
    """
    target_key = {
        'Insufficient_Weight': 0,
        'Normal_Weight': 1,
        'Overweight_Level_I': 2,
        'Overweight_Level_II': 3,
        'Obesity_Type_I': 4,
        'Obesity_Type_II': 5,
        'Obesity_Type_III': 6,
    }
    train['NObeyesdad'] = train['NObeyesdad'].map(target_key)
    return train
22
23
def datatypes(train):
    """Cast the ``Weight``, ``Age`` and ``Height`` columns to float.

    The columns are overwritten in place on the frame that is passed in.

    Args:
        train: DataFrame containing ``Weight``, ``Age`` and ``Height``.

    Returns:
        The same DataFrame object, with the three columns as float.
    """
    for column in ('Weight', 'Age', 'Height'):
        train[column] = train[column].astype(float)
    return train
28
29
def age_binning(train_df):
    """Add an ``Age_Group`` categorical column derived from ``Age``.

    Ages are bucketed into right-closed intervals (0, 20], (20, 30],
    (30, 40], (40, 50] and (50, inf), labelled ``'0-20'``, ``'21-30'``,
    ``'31-40'``, ``'41-50'`` and ``'50+'``. Ages of 0 or below fall outside
    every interval and become NaN.

    The column is added in place on the frame that is passed in.

    Args:
        train_df: DataFrame containing a numeric ``Age`` column.

    Returns:
        The same DataFrame object, with ``Age_Group`` added.
    """
    # The last edge is open-ended rather than the observed maximum age:
    # using Age.max() makes pd.cut raise whenever nobody is older than 50
    # (edges no longer increase) and ties the bins to the particular frame.
    age_edges = [0, 20, 30, 40, 50, float('inf')]
    age_labels = ['0-20', '21-30', '31-40', '41-50', '50+']
    train_df['Age_Group'] = pd.cut(train_df['Age'], bins=age_edges, labels=age_labels)
    return train_df
32
33
def age_scaling_log(train_df):
    """Add a ``Log_Age`` column holding ``log(1 + Age)``.

    ``Age`` is cast to float first; both columns are written in place on
    the frame that is passed in.

    Args:
        train_df: DataFrame containing an ``Age`` column.

    Returns:
        The same DataFrame object, with ``Log_Age`` added.
    """
    train_df['Age'] = train_df['Age'].astype(float)
    train_df['Log_Age'] = np.log1p(train_df['Age'])
    return train_df
37
38
def age_scaling_minmax(train_df):
    """Add a ``Scaled_Age`` column produced by a freshly fitted MinMaxScaler.

    ``Age`` is cast to float first; both columns are written in place on
    the frame that is passed in.

    Args:
        train_df: DataFrame containing an ``Age`` column.

    Returns:
        A ``(train_df, scaler_age)`` tuple: the same DataFrame object and the
        scaler fitted on its ``Age`` values, for reuse on other data.
    """
    train_df['Age'] = train_df['Age'].astype(float)
    scaler_age = MinMaxScaler()
    # The scaler works on 2-D input and returns an (n, 1) array; flatten it
    # so the column assignment receives a plain 1-D vector.
    scaled = scaler_age.fit_transform(train_df['Age'].values.reshape(-1, 1))
    train_df['Scaled_Age'] = scaled.ravel()
    return train_df, scaler_age
43
44
def weight_scaling_log(train_df):
    """Add a ``Log_Weight`` column holding ``log(1 + Weight)``.

    ``Weight`` is cast to float first; both columns are written in place on
    the frame that is passed in.

    Args:
        train_df: DataFrame containing a ``Weight`` column.

    Returns:
        The same DataFrame object, with ``Log_Weight`` added.
    """
    train_df['Weight'] = train_df['Weight'].astype(float)
    train_df['Log_Weight'] = np.log1p(train_df['Weight'])
    return train_df
48
49
def weight_scaling_minmax(train_df):
    """Add a ``Scaled_Weight`` column produced by a freshly fitted MinMaxScaler.

    ``Weight`` is cast to float first; both columns are written in place on
    the frame that is passed in.

    Args:
        train_df: DataFrame containing a ``Weight`` column.

    Returns:
        A ``(train_df, scaler_weight)`` tuple: the same DataFrame object and
        the scaler fitted on its ``Weight`` values, for reuse on other data.
    """
    train_df['Weight'] = train_df['Weight'].astype(float)
    scaler_weight = MinMaxScaler()
    # The scaler works on 2-D input and returns an (n, 1) array; flatten it
    # so the column assignment receives a plain 1-D vector.
    scaled = scaler_weight.fit_transform(train_df['Weight'].values.reshape(-1, 1))
    train_df['Scaled_Weight'] = scaled.ravel()
    return train_df, scaler_weight
54
55
def height_scaling_log(train_df):
    """Add a ``Log_Height`` column holding ``log(1 + Height)``.

    ``Height`` is cast to float first, matching the Age and Weight variants
    of this helper; both columns are written in place on the frame that is
    passed in.

    Args:
        train_df: DataFrame containing a ``Height`` column.

    Returns:
        The same DataFrame object, with ``Log_Height`` added.
    """
    train_df['Height'] = train_df['Height'].astype(float)
    train_df['Log_Height'] = np.log1p(train_df['Height'])
    return train_df
58
59
def height_scaling_minmax(train_df):
    """Add a ``Scaled_Height`` column produced by a freshly fitted MinMaxScaler.

    ``Height`` is cast to float first, matching the Age and Weight variants
    of this helper; both columns are written in place on the frame that is
    passed in.

    Args:
        train_df: DataFrame containing a ``Height`` column.

    Returns:
        A ``(train_df, scaler_height)`` tuple: the same DataFrame object and
        the scaler fitted on its ``Height`` values, for reuse on other data.
    """
    train_df['Height'] = train_df['Height'].astype(float)
    scaler_height = MinMaxScaler()
    # The scaler works on 2-D input and returns an (n, 1) array; flatten it
    # so the column assignment receives a plain 1-D vector.
    scaled = scaler_height.fit_transform(train_df['Height'].values.reshape(-1, 1))
    train_df['Scaled_Height'] = scaled.ravel()
    return train_df, scaler_height
63
64
def other_features(train):
    """Derive BMI, interaction, one-hot and polynomial feature columns.

    Steps, in order:
      1. Print the dtypes and unique values of ``Age`` and ``Gender``.
      2. Add ``BMI`` = ``Weight / Height ** 2``.
      3. Pass the frame through ``make_gender_binary`` and add
         ``Age * Gender``.
      4. One-hot encode the categorical columns with ``pd.get_dummies``.
      5. Append the degree-2 polynomial expansion of ``Age`` and ``BMI``.

    Args:
        train: DataFrame with ``Age``, ``Gender``, ``Weight``, ``Height``,
            ``Age_Group`` and the categorical survey columns listed below.

    Returns:
        A DataFrame with the derived columns appended. ``BMI`` is written to
        the frame that was passed in before later steps rebind ``train``.
    """
    # Diagnostic output: dtypes of the 'Age' and 'Gender' columns
    print("Data types:")
    print(train[['Age', 'Gender']].dtypes)

    # Diagnostic output: raw values present in 'Age' and 'Gender'
    print("\nUnique values in 'Age' column:")
    print(train['Age'].unique())
    print("\nUnique values in 'Gender' column:")
    print(train['Gender'].unique())

    # Body-mass index: weight divided by height squared
    train['BMI'] = train['Weight'] / (train['Height'] ** 2)

    # Gender must be numeric before it can be multiplied with Age;
    # presumably make_gender_binary performs that conversion -- it is
    # defined outside this block.
    train = make_gender_binary(train)
    train['Age * Gender'] = train['Age'] * train['Gender']

    categorical_features = [
        'family_history_with_overweight', 'Age_Group', 'FAVC', 'CAEC',
        'SMOKE', 'SCC', 'CALC', 'MTRANS',
    ]
    train = pd.get_dummies(train, columns=categorical_features)

    polynomial_features = PolynomialFeatures(degree=2)
    X_poly = polynomial_features.fit_transform(train[['Age', 'BMI']])
    # NOTE(review): per the scikit-learn documentation a degree-2 expansion
    # of [Age, BMI] is ordered [1, Age, BMI, Age^2, Age*BMI, BMI^2], so the
    # labels below do not describe the column contents. They are left
    # unchanged because downstream code may select columns by these names.
    poly_columns = ['Age^2', 'Age^3', 'BMI^2', 'Age * BMI', 'Age * BMI^2', 'Age^2 * BMI^2']
    # Reuse train's index so the column-wise concat lines rows up one-to-one
    # even when the caller's frame does not carry a default 0..n-1 index.
    poly_features_df = pd.DataFrame(X_poly, columns=poly_columns, index=train.index)
    train = pd.concat([train, poly_features_df], axis=1)
    return train
86
87
def test_pipeline(test, scaler_age, scaler_weight, scaler_height):
    """Run a held-out frame through the feature-engineering steps.

    Encodes the target, bins and log-scales ``Age``, log-scales ``Weight``
    and ``Height``, adds the ``Scaled_*`` columns using the supplied scalers
    (``transform`` only; they are not refitted here) and finally applies
    ``other_features``.

    Args:
        test: DataFrame to transform.
        scaler_age: Fitted scaler applied to ``Age``.
        scaler_weight: Fitted scaler applied to ``Weight``.
        scaler_height: Fitted scaler applied to ``Height``.
            Presumably these are the scalers returned by the
            ``*_scaling_minmax`` helpers -- verify against the caller.

    Returns:
        The transformed DataFrame returned by ``other_features``.
    """
    test = encode_target(test)
    test = age_binning(test)

    # Each scaler returns an (n, 1) array; flatten it so the column
    # assignment receives a plain 1-D vector.
    test = age_scaling_log(test)
    test['Scaled_Age'] = scaler_age.transform(test['Age'].values.reshape(-1, 1)).ravel()

    test = weight_scaling_log(test)
    test['Scaled_Weight'] = scaler_weight.transform(test['Weight'].values.reshape(-1, 1)).ravel()

    test = height_scaling_log(test)
    test['Scaled_Height'] = scaler_height.transform(test['Height'].values.reshape(-1, 1)).ravel()

    test = other_features(test)
    return test
98