--- /dev/null
+++ b/Patient-Risk-Model_LR_RF_XGB.py
@@ -0,0 +1,71 @@
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from xgboost import XGBClassifier
+from sklearn.metrics import accuracy_score, classification_report
+
+# Load dataset (replace with the actual dataset path)
+df = pd.read_csv("healthcare_data.csv")
+
+# Target column is 'Risk' (0: Low, 1: High); drop the target and the
+# 'PatientID' identifier, then keep only numeric feature columns
+y = df['Risk']
+X = df.drop(columns=['Risk', 'PatientID'])
+X = X.select_dtypes(include=np.number)
+
+# Handle missing values (simple mean imputation; for a leak-free
+# pipeline, fit the imputation on the training split only)
+X = X.fillna(X.mean())
+
+# Split dataset (stratify to preserve the class balance in both splits)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42, stratify=y
+)
+
+# Standardize features (scaling matters for logistic regression; the
+# tree-based models below are scale-invariant and use the raw features)
+scaler = StandardScaler()
+X_train_scaled = scaler.fit_transform(X_train)
+X_test_scaled = scaler.transform(X_test)
+
+# Model 1: Logistic Regression
+log_model = LogisticRegression(max_iter=1000)
+log_model.fit(X_train_scaled, y_train)
+y_pred_log = log_model.predict(X_test_scaled)
+print("Logistic Regression Performance:")
+print(classification_report(y_test, y_pred_log))
+
+# Model 2: Random Forest
+rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
+rf_model.fit(X_train, y_train)
+y_pred_rf = rf_model.predict(X_test)
+print("Random Forest Performance:")
+print(classification_report(y_test, y_pred_rf))
+
+# Model 3: XGBoost (use_label_encoder was deprecated and later removed
+# from xgboost, so it is not passed here)
+xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
+xgb_model.fit(X_train, y_train)
+y_pred_xgb = xgb_model.predict(X_test)
+print("XGBoost Performance:")
+print(classification_report(y_test, y_pred_xgb))
+
+# Compare Model Accuracies
+print("Model Accuracies:")
+print(f"Logistic Regression: {accuracy_score(y_test, y_pred_log):.4f}")
+print(f"Random Forest: {accuracy_score(y_test, y_pred_rf):.4f}")
+print(f"XGBoost: {accuracy_score(y_test, y_pred_xgb):.4f}")
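+
+# Optional extension (a minimal sketch, not required by the comparison
+# above): ROC-AUC is often more informative than raw accuracy for risk
+# scoring because it is threshold-independent; it assumes the binary
+# 'Risk' target and uses each model's predicted probabilities.
+from sklearn.metrics import roc_auc_score
+
+print("Model ROC-AUC:")
+print(f"Logistic Regression: {roc_auc_score(y_test, log_model.predict_proba(X_test_scaled)[:, 1]):.4f}")
+print(f"Random Forest: {roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1]):.4f}")
+print(f"XGBoost: {roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1]):.4f}")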