--- /dev/null
+++ b/strokepred.py
@@ -0,0 +1,98 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.impute import SimpleImputer
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.metrics import f1_score, classification_report
+
+# Load the datasets
+train_set = pd.read_csv('/path/to/stroke_train_set.csv')
+test_set = pd.read_csv('/path/to/stroke_test_set_nogt.csv')
+
+# Data preprocessing:
+# separate the target variable from the features in the training set
+X = train_set.drop('stroke', axis=1)
+y = train_set['stroke']
+
+# Identify numerical and categorical columns
+numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
+categorical_cols = X.select_dtypes(include=['object']).columns
+
+# Preprocessor with per-type transformations: median imputation for
+# numerical columns; most-frequent imputation and one-hot encoding for
+# categorical columns. handle_unknown='ignore' prevents prediction from
+# failing on categories that appear only in the test set.
+preprocessor = ColumnTransformer(
+    transformers=[
+        ('num', SimpleImputer(strategy='median'), numerical_cols),
+        ('cat', Pipeline(steps=[
+            ('imputer', SimpleImputer(strategy='most_frequent')),
+            ('onehot', OneHotEncoder(handle_unknown='ignore'))
+        ]), categorical_cols)
+    ])
+
+# Pipeline with preprocessing and a baseline classifier
+model = Pipeline(steps=[('preprocessor', preprocessor),
+                        ('classifier', RandomForestClassifier(random_state=42))])
+
+# Split the data into training and validation sets, stratified on the
+# target since stroke cases are rare
+X_train, X_val, y_train, y_val = train_test_split(
+    X, y, test_size=0.2, random_state=42, stratify=y)
+
+# Fit the baseline model
+model.fit(X_train, y_train)
+
+# Predict on the validation set and evaluate
+y_pred = model.predict(X_val)
+f1 = f1_score(y_val, y_pred)
+print(f"F1 Score: {f1}")
+print(classification_report(y_val, y_pred))
+
+# Hyperparameter tuning for an alternative model
+model_gb = Pipeline(steps=[
+    ('preprocessor', preprocessor),
+    ('classifier', GradientBoostingClassifier(random_state=42))
+])
+
+param_grid = {
+    'classifier__n_estimators': [100, 200],
+    'classifier__learning_rate': [0.01, 0.1],
+    'classifier__max_depth': [3, 5]
+}
+
+grid_search = GridSearchCV(model_gb, param_grid, cv=5, scoring='f1', n_jobs=-1)
+grid_search.fit(X_train, y_train)
+
+# Best parameters and model performance
+best_params = grid_search.best_params_
+y_pred_gb = grid_search.predict(X_val)
+f1_gb = f1_score(y_val, y_pred_gb)
+print(f"Best Parameters: {best_params}")
+print(f"F1 Score (Gradient Boosting): {f1_gb}")
+print(classification_report(y_val, y_pred_gb))
+
+# Final predictions on the test set, using whichever model scored
+# higher on the validation set
+best_model = grid_search.best_estimator_ if f1_gb >= f1 else model
+final_predictions = best_model.predict(test_set)
+submission_df = pd.DataFrame({'ID': test_set.index, 'stroke': final_predictions})
+
+# Save the submission file
+submission_df.to_csv('/path/to/final_submission.csv', index=False)
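+
+# A minimal optional sketch (an addition, not part of the original
+# script): the stroke target is usually heavily imbalanced and F1 is the
+# metric above, so reweighting the positive class is worth comparing.
+# class_weight='balanced' is a standard RandomForestClassifier option;
+# model_balanced is a name introduced here for illustration.
+model_balanced = Pipeline(steps=[
+    ('preprocessor', preprocessor),
+    ('classifier', RandomForestClassifier(class_weight='balanced',
+                                          random_state=42))
+])
+model_balanced.fit(X_train, y_train)
+y_pred_bal = model_balanced.predict(X_val)
+print(f"F1 Score (balanced RF): {f1_score(y_val, y_pred_bal)}")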