# diabetes-api/train_model.py
"""Train a diabetes classifier and save it to ``diabetes_model.pkl``.

Steps, in order:

1. Read ``diabetes.csv``.
2. Treat zeros in selected measurement columns as missing values.
3. Split into train/test sets, then impute missing values with medians
   computed from the training set only.
4. Grid-search a ``StandardScaler`` + ``RandomForestClassifier`` pipeline
   with 5-fold cross-validation, scored by accuracy.
5. Print accuracy on the held-out test set and dump the best pipeline
   with joblib.
"""

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib

# Load dataset
df = pd.read_csv('diabetes.csv')

# Handle missing values (replace zeros with NaN for specific columns).
# A float NaN is used instead of pd.NA: pd.NA cannot be stored in a
# NumPy-backed numeric column, so it would leave these columns as object
# dtype and break the median computation and the scaler further down.
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[columns_to_replace] = df[columns_to_replace].replace(0, float('nan'))

# Separate features and labels
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fill missing values with the median. The medians are learned from the
# training rows only and then applied to both splits, so the held-out rows
# never influence the imputation (avoids leaking test data into the score).
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Create a pipeline with scaling and Random Forest
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hyperparameter tuning: keys use the "<step>__<param>" form so the grid
# search routes each value to the 'classifier' step of the pipeline.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model (the pipeline refit with the best parameter combination)
best_model = grid_search.best_estimator_

# Evaluate on the held-out test split
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc * 100:.2f}%")

# Save the model
joblib.dump(best_model, 'diabetes_model.pkl')
print("Model saved as diabetes_model.pkl")