"""Train and persist a diabetes classifier.

Reads ``diabetes.csv`` from the current working directory, tunes a
StandardScaler + RandomForestClassifier pipeline with a cross-validated grid
search, prints the accuracy on a held-out split, and writes the best pipeline
to ``diabetes_model.pkl``.
"""

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('diabetes.csv')

# In these columns a zero is treated as a missing value, not a measurement.
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Use a float NaN rather than pd.NA: pd.NA would turn these columns into
# object dtype, which breaks the numeric median computation below.
df[columns_to_replace] = df[columns_to_replace].replace(0, float('nan'))

# Separate features and labels
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Train/test split (done BEFORE imputation so no test-set statistics leak
# into the values the model is trained on).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fill missing values with medians computed from the training rows only, and
# apply those same medians to the held-out rows.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Create a pipeline with scaling and Random Forest
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hyperparameter tuning: the ``classifier__`` prefix routes each parameter to
# the pipeline step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # use all available CPU cores
)
grid_search.fit(X_train, y_train)

# Best model (refit on the full training split by GridSearchCV)
best_model = grid_search.best_estimator_

# Evaluate on the held-out split
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc * 100:.2f}%")

# Save the model
joblib.dump(best_model, 'diabetes_model.pkl')
print("Model saved as diabetes_model.pkl")