# diabetes-api/train_model.py
1
import pandas as pd
2
from sklearn.model_selection import train_test_split, GridSearchCV
3
from sklearn.ensemble import RandomForestClassifier
4
from sklearn.preprocessing import StandardScaler
5
from sklearn.pipeline import Pipeline
6
from sklearn.metrics import accuracy_score
7
import joblib
8
9
# Train a diabetes classifier: load the CSV, clean missing measurements,
# tune a scaled Random Forest with grid search, report test accuracy and
# persist the best pipeline to disk.

# Load dataset
df = pd.read_csv('diabetes.csv')

# In these columns the script treats a value of 0 as a missing measurement.
# Mask the zeros to float NaN instead of pd.NA: writing pd.NA into plain
# numeric columns can turn them into object dtype, which makes the median
# computation and the downstream scaler unreliable.
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[columns_to_replace] = df[columns_to_replace].mask(df[columns_to_replace] == 0)

# Separate features and labels
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Train/test split (done BEFORE imputation so that no statistic of the
# held-out rows is used while preparing the training data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fill missing values with the median learned from the training rows only,
# then apply those same medians to the test rows.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Create a pipeline with scaling and Random Forest
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hyperparameter tuning: exhaustive search over the forest's size and
# tree-shape parameters, scored by 5-fold cross-validated accuracy.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model (the pipeline refit with the best parameter combination)
best_model = grid_search.best_estimator_

# Evaluate on the held-out test rows
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc * 100:.2f}%")

# Save the model
joblib.dump(best_model, 'diabetes_model.pkl')
print("Model saved as diabetes_model.pkl")