|
a |
|
b/03-Experiments/Utils/utils.py |
|
|
1 |
import pandas as pd |
|
|
2 |
from sklearn.model_selection import train_test_split |
|
|
3 |
import matplotlib.pyplot as plt |
|
|
4 |
import seaborn as sns |
|
|
5 |
import numpy as np |
|
|
6 |
from sklearn.preprocessing import MinMaxScaler |
|
|
7 |
from sklearn.preprocessing import PolynomialFeatures |
|
|
8 |
|
|
|
9 |
def load_data(path, *, test_size=0.35, val_size=0.20, random_state=42):
    """Read a CSV file and split it into train, validation and test frames.

    The rows are first split into train/test; the train part is then split
    again into train/validation. Each resulting frame has its ``id`` column
    dropped, duplicate rows removed and its index reset to 0..n-1.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        test_size: Fraction of all rows held out for the test frame.
        val_size: Fraction of the remaining (non-test) rows held out for
            the validation frame.
        random_state: Seed used for both splits so they are reproducible.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of DataFrames.

    Raises:
        KeyError: If the CSV has no ``id`` column.
    """
    def _clean(frame):
        # De-duplication happens per split and only after 'id' is removed,
        # so rows that differ solely by id collapse into one.
        return frame.drop(columns=['id']).drop_duplicates().reset_index(drop=True)

    df = pd.read_csv(path)
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    train_df, val_df = train_test_split(
        train_df, test_size=val_size, random_state=random_state
    )
    return _clean(train_df), _clean(val_df), _clean(test_df)
|
|
17 |
|
|
|
18 |
def encode_target(train):
    """Replace the ``NObeyesdad`` class labels with integer codes.

    The codes follow the order of the label list below (0 for
    'Insufficient_Weight' up to 6 for 'Obesity_Type_III'). Labels that are
    not in the list become NaN, as ``Series.map`` does for unmapped keys.
    The column is overwritten on the frame that was passed in, and that
    same frame is returned.
    """
    ordered_labels = [
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III',
    ]
    # Position in the list is the integer code.
    label_to_code = {label: code for code, label in enumerate(ordered_labels)}
    train['NObeyesdad'] = train['NObeyesdad'].map(label_to_code)
    return train
|
|
22 |
|
|
|
23 |
def datatypes(train):
    """Cast the ``Weight``, ``Age`` and ``Height`` columns to float.

    The columns are overwritten on the frame that was passed in, and that
    same frame is returned.
    """
    for column in ('Weight', 'Age', 'Height'):
        train[column] = train[column].astype(float)
    return train
|
|
28 |
|
|
|
29 |
def age_binning(train_df):
    """Add an ``Age_Group`` column that buckets ``Age`` into fixed ranges.

    The buckets are right-inclusive (``pd.cut`` default): (0, 20] -> '0-20',
    (20, 30] -> '21-30', (30, 40] -> '31-40', (40, 50] -> '41-50' and
    everything above 50 -> '50+'. Ages that fall outside every bucket
    (0 or below, or missing) get NaN.

    The column is written onto the frame that was passed in, and that same
    frame is returned.
    """
    # The top edge is open-ended rather than ``Age.max()``: pd.cut needs
    # strictly increasing edges, so a frame whose oldest row is 50 or
    # younger would otherwise raise ValueError. Ages above 50 land in
    # '50+' exactly as they did with the data-dependent edge.
    age_edges = [0, 20, 30, 40, 50, np.inf]
    age_labels = ['0-20', '21-30', '31-40', '41-50', '50+']
    train_df['Age_Group'] = pd.cut(train_df['Age'], bins=age_edges, labels=age_labels)
    return train_df
|
|
32 |
|
|
|
33 |
def age_scaling_log(train_df):
    """Cast ``Age`` to float and add ``Log_Age`` = log(1 + Age).

    Both columns are written onto the frame that was passed in, and that
    same frame is returned.
    """
    ages = train_df['Age'].astype(float)
    train_df['Age'] = ages
    train_df['Log_Age'] = np.log1p(ages)
    return train_df
|
|
37 |
|
|
|
38 |
def age_scaling_minmax(train_df):
    """Cast ``Age`` to float and add a min-max scaled ``Scaled_Age`` column.

    A new ``MinMaxScaler`` is fitted on the ``Age`` values of this frame.

    Returns:
        ``(train_df, scaler_age)`` -- the same frame that was passed in
        (modified in place) and the fitted scaler, so the identical
        scaling can later be applied to other data.
    """
    train_df['Age'] = train_df['Age'].astype(float)
    # The scaler expects a 2-D input, hence the single-column reshape.
    age_matrix = train_df['Age'].values.reshape(-1, 1)
    scaler_age = MinMaxScaler()
    scaled_age = scaler_age.fit_transform(age_matrix)
    train_df['Scaled_Age'] = scaled_age
    return train_df, scaler_age
|
|
43 |
|
|
|
44 |
def weight_scaling_log(train_df):
    """Cast ``Weight`` to float and add ``Log_Weight`` = log(1 + Weight).

    Both columns are written onto the frame that was passed in, and that
    same frame is returned.
    """
    weights = train_df['Weight'].astype(float)
    train_df['Weight'] = weights
    train_df['Log_Weight'] = np.log1p(weights)
    return train_df
|
|
48 |
|
|
|
49 |
def weight_scaling_minmax(train_df):
    """Cast ``Weight`` to float and add a min-max scaled ``Scaled_Weight`` column.

    A new ``MinMaxScaler`` is fitted on the ``Weight`` values of this frame.

    Returns:
        ``(train_df, scaler_weight)`` -- the same frame that was passed in
        (modified in place) and the fitted scaler, so the identical
        scaling can later be applied to other data.
    """
    train_df['Weight'] = train_df['Weight'].astype(float)
    # The scaler expects a 2-D input, hence the single-column reshape.
    weight_matrix = train_df['Weight'].values.reshape(-1, 1)
    scaler_weight = MinMaxScaler()
    scaled_weight = scaler_weight.fit_transform(weight_matrix)
    train_df['Scaled_Weight'] = scaled_weight
    return train_df, scaler_weight
|
|
54 |
|
|
|
55 |
def height_scaling_log(train_df):
    """Add ``Log_Height`` = log(1 + Height).

    Unlike the age and weight variants, ``Height`` is not cast to float
    here. The new column is written onto the frame that was passed in,
    and that same frame is returned.
    """
    heights = train_df['Height']
    train_df['Log_Height'] = np.log1p(heights)
    return train_df
|
|
58 |
|
|
|
59 |
def height_scaling_minmax(train_df):
    """Add a min-max scaled ``Scaled_Height`` column.

    A new ``MinMaxScaler`` is fitted on the ``Height`` values of this
    frame. ``Height`` itself is left untouched (no float cast here).

    Returns:
        ``(train_df, scaler_height)`` -- the same frame that was passed in
        (modified in place) and the fitted scaler, so the identical
        scaling can later be applied to other data.
    """
    # The scaler expects a 2-D input, hence the single-column reshape.
    height_matrix = train_df['Height'].values.reshape(-1, 1)
    scaler_height = MinMaxScaler()
    scaled_height = scaler_height.fit_transform(height_matrix)
    train_df['Scaled_Height'] = scaled_height
    return train_df, scaler_height
|
|
63 |
|
|
|
64 |
def other_features(train):
    """Add derived features: BMI, Age*Gender, one-hot dummies and polynomial terms.

    Steps, in order:
      1. Print the dtypes and unique values of ``Age`` and ``Gender``
         (diagnostic output only).
      2. Add ``BMI`` = Weight / Height**2 (written onto the passed frame).
      3. Run ``make_gender_binary`` and add ``Age * Gender``.
      4. One-hot encode the categorical columns listed below. ``Age_Group``
         must already exist, so ``age_binning`` has to run first.
      5. Append six degree-2 polynomial columns built from ``Age`` and ``BMI``.

    Args:
        train: DataFrame holding at least Age, Gender, Weight, Height,
            Age_Group and the categorical columns listed below.

    Returns:
        A new DataFrame with the extra columns (``pd.get_dummies`` and
        ``pd.concat`` both return new frames).
    """
    # Diagnostics: dtypes of 'Age' and 'Gender'.
    print("Data types:")
    print(train[['Age', 'Gender']].dtypes)

    # Diagnostics: raw values, to spot non-numeric entries.
    print("\nUnique values in 'Age' column:")
    print(train['Age'].unique())
    print("\nUnique values in 'Gender' column:")
    print(train['Gender'].unique())

    # Body-mass index: weight divided by height squared.
    train['BMI'] = train['Weight'] / (train['Height'] ** 2)

    # Interaction term. Presumably make_gender_binary turns 'Gender' into
    # 0/1 so the product is numeric -- confirm against its definition.
    train = make_gender_binary(train)
    train['Age * Gender'] = train['Age'] * train['Gender']

    categorical_features = [
        'family_history_with_overweight', 'Age_Group', 'FAVC', 'CAEC',
        'SMOKE', 'SCC', 'CALC', 'MTRANS',
    ]
    train = pd.get_dummies(train, columns=categorical_features)

    polynomial_features = PolynomialFeatures(degree=2)
    X_poly = polynomial_features.fit_transform(train[['Age', 'BMI']])
    # NOTE(review): for inputs [Age, BMI], PolynomialFeatures(degree=2)
    # emits [1, Age, BMI, Age^2, Age*BMI, BMI^2] in that order, so the
    # labels below do not describe the values stored under them (e.g.
    # 'Age^2' holds the constant 1 and 'Age^3' holds Age). Labels and
    # values are kept as-is so downstream consumers see the same schema;
    # decide separately whether to relabel or recompute.
    poly_features_df = pd.DataFrame(
        X_poly,
        columns=['Age^2', 'Age^3', 'BMI^2', 'Age * BMI', 'Age * BMI^2', 'Age^2 * BMI^2'],
        # Reuse train's index: with a fresh 0..n-1 index, the axis=1 concat
        # would misalign rows and insert NaNs when train is indexed differently.
        index=train.index,
    )
    train = pd.concat([train, poly_features_df], axis=1)
    return train
|
|
86 |
|
|
|
87 |
def test_pipeline(test, scaler_age, scaler_weight, scaler_height):
    """Run the feature pipeline on a held-out frame using already-fitted scalers.

    The target is encoded and ages are binned first. Then, for Age, Weight
    and Height in that order, the log column is added followed by the
    ``Scaled_*`` column. The scaled columns come from the scalers passed in,
    via ``transform`` only; nothing is re-fitted here. Finally
    ``other_features`` adds the remaining derived columns.

    Args:
        test: DataFrame to transform.
        scaler_age: Fitted scaler applied to ``Age``.
        scaler_weight: Fitted scaler applied to ``Weight``.
        scaler_height: Fitted scaler applied to ``Height``.

    Returns:
        The transformed DataFrame returned by ``other_features``.
    """
    test = age_binning(encode_target(test))

    # (source column, log-feature function, fitted scaler), in the order
    # the columns must be appended.
    numeric_steps = (
        ('Age', age_scaling_log, scaler_age),
        ('Weight', weight_scaling_log, scaler_weight),
        ('Height', height_scaling_log, scaler_height),
    )
    for column, add_log_feature, fitted_scaler in numeric_steps:
        test = add_log_feature(test)
        column_matrix = test[column].values.reshape(-1, 1)
        test[f'Scaled_{column}'] = fitted_scaler.transform(column_matrix)

    return other_features(test)
|
|
98 |
|