1265 lines (1264 with data), 55.3 kB
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Load Data",
"id": "6d1b372bd242a6f7"
},
{
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-09-18T10:13:09.095287Z",
"start_time": "2024-09-18T10:13:07.212667Z"
}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from xgboost import XGBClassifier\n",
"\n",
"df = pd.read_excel('dataset.xlsx',engine='openpyxl')\n",
"df.drop(['Unnamed: 0', 'visit id'], axis=1, inplace=True)\n",
"from sklearn.model_selection import train_test_split\n",
"# X = df.drop(['target label / yes no'],axis=1)\n",
"# y = df['target label / yes no']\n",
"# df_train, df_test= train_test_split(df, test_size=0.2,shuffle=True, random_state=42)"
],
"id": "initial_id",
"outputs": [],
"execution_count": 2
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Grid Search\n",
"Search for best paramters and log them in mlflow"
],
"id": "7ba2217da72ca9c1"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T11:29:19.406874Z",
"start_time": "2024-09-18T11:14:43.860088Z"
}
},
"cell_type": "code",
"source": [
"from utils import Preprocess, MissingValue\n",
"\n",
"from utils import MLModelSelector\n",
"\n",
"\n",
"preprocess_param_grid = {\n",
" 'missing_value_per': [0.1, 0.2, 0.3, 0.4, 0.5],\n",
" 'variance_threshold': [0.0, 0.01, 0.05, 0.1, 0.2],\n",
" 'min_null_per': [0.5]\n",
"}\n",
"\n",
"param_grids = {\n",
" # 'SVM': {\n",
" # 'C': [0.1, 1, 10], \n",
" # 'kernel': ['linear', 'rbf'] \n",
" # },\n",
" # 'logistic_regression': {\n",
" # 'C': [0.1, 1, 10], \n",
" # 'solver': ['lbfgs'], \n",
" # 'max_iter': [500, 1000, 2000] \n",
" # },\n",
" 'random_forest': {\n",
" 'n_estimators': [150, 250], # Number of trees in the forest\n",
" 'max_depth': [10, 20], # Maximum depth of the tree\n",
" 'min_samples_split': [2, 5, 10], # Minimum number of samples required to split an internal node\n",
" 'min_samples_leaf': [1, 2], # Minimum number of samples required to be at a leaf node\n",
" 'bootstrap': [True], # Whether bootstrap samples are used when building trees\n",
"},\n",
" 'XGBoost': {\n",
" 'n_estimators': [100, 200], # Number of boosting rounds\n",
" 'max_depth': [3, 6], # Maximum depth of a tree\n",
" 'learning_rate': [0.1, 0.2], # Step size shrinkage used in update to prevent overfitting\n",
" 'subsample': [0.6, 1.0], # Fraction of samples used for training each tree\n",
" 'colsample_bytree': [0.8, 1.0], # Fraction of features used at each split\n",
"}\n",
"}\n",
"\n",
"best_preprocess_params = None\n",
"best_model_params = None\n",
"best_overall_score = -1\n",
"best_model_name = None\n",
"i = 0\n",
"for missing_value_per in preprocess_param_grid['missing_value_per']:\n",
" for variance_threshold in preprocess_param_grid['variance_threshold']:\n",
" for min_null_per in preprocess_param_grid['min_null_per']:\n",
" df_train, df_test= train_test_split(df, test_size=0.2,shuffle=True)\n",
" preprocess_params = {\n",
" 'missing_value_per': missing_value_per,\n",
" 'variance_threshold': variance_threshold,\n",
" 'min_null_per': min_null_per\n",
" }\n",
" pre_data = Preprocess(\n",
" dataframe=df_train, \n",
" missing_value_per=missing_value_per, \n",
" variance_threshold=variance_threshold, \n",
" min_null_per=min_null_per\n",
" )\n",
" pre_data.apply()\n",
" df_train = pre_data.dataframe\n",
" missing = MissingValue(original_df=df_train,test_size=0.1)\n",
" df_train = missing.fill_dataframe()\n",
" cols = df_train.columns\n",
" X_train = df_train.drop(['target label / yes no'], axis=1)\n",
" y_train = df_train['target label / yes no']\n",
" \n",
" df_test = df_test[cols]\n",
" \n",
" df_test = pre_data._mapping(df_test)\n",
" missing = MissingValue(original_df=df_test,test_size=0.1)\n",
" df_test = missing.fill_dataframe()\n",
" X_test = df_test.drop(['target label / yes no'], axis=1)\n",
" y_test = df_test['target label / yes no'].values.astype(int)\n",
" \n",
" for model_name, param_grid in param_grids.items():\n",
" model_selector = MLModelSelector()\n",
" best_params, best_score = model_selector.train_model(\n",
" X_train, y_train,X_test,y_test, \n",
" model_name, \n",
" param_grid, \n",
" preprocess_parameters=preprocess_params \n",
" )\n",
" \n",
" if best_score > best_overall_score:\n",
" best_overall_score = best_score\n",
" best_preprocess_params = preprocess_params\n",
" best_model_params = best_params\n",
" best_model_name = model_name\n",
" \n",
" i = i + 1\n",
" print(i)\n",
" print(best_overall_score)\n",
"\n",
"print(f\"Best Preprocess Params: {best_preprocess_params}\")\n",
"print(f\"Best Model Params: {best_model_params}\")\n",
"print(f\"Best Overall Score: {best_overall_score}\")\n"
],
"id": "4228dc0652080dc5",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n",
"0.7454545454545455\n",
"2\n",
"0.7454545454545455\n",
"3\n",
"0.7484030554078361\n",
"4\n",
"0.7546958304853042\n",
"5\n",
"0.7546958304853042\n",
"6\n",
"0.7562392881887466\n",
"7\n",
"0.7562392881887466\n",
"8\n",
"0.7562392881887466\n",
"9\n",
"0.7562392881887466\n",
"10\n",
"0.7562392881887466\n",
"11\n",
"0.7562392881887466\n",
"12\n",
"0.7562392881887466\n",
"13\n",
"0.7562392881887466\n",
"14\n",
"0.7562392881887466\n",
"15\n",
"0.7562392881887466\n",
"16\n",
"0.8621799805212067\n",
"17\n",
"0.8621799805212067\n",
"18\n",
"0.8853833897195243\n",
"19\n",
"0.8853833897195243\n",
"20\n",
"0.8853833897195243\n",
"21\n",
"0.8853833897195243\n",
"22\n",
"0.8853833897195243\n",
"23\n",
"0.8853833897195243\n",
"24\n",
"0.8853833897195243\n",
"25\n",
"0.8853833897195243\n",
"Best Preprocess Params: {'missing_value_per': 0.4, 'variance_threshold': 0.05, 'min_null_per': 0.5}\n",
"Best Model Params: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 250}\n",
"Best Overall Score: 0.8853833897195243\n"
]
}
],
"execution_count": 16
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"# Train Best Model\n",
"Use the best parameter and save the model and other important elements"
],
"id": "8dd1bb70b6cb71e9"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T14:09:22.818872Z",
"start_time": "2024-09-18T14:09:22.798869Z"
}
},
"cell_type": "code",
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.preprocessing import StandardScaler\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from utils import Preprocess, MissingValue\n",
"import pickle\n",
"from sklearn.metrics import f1_score\n",
"from xgboost import XGBClassifier"
],
"id": "43b0c8f8f2c807ea",
"outputs": [],
"execution_count": 47
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Load Data",
"id": "c08e3333ceadab75"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T14:16:42.761730Z",
"start_time": "2024-09-18T14:16:37.848559Z"
}
},
"cell_type": "code",
"source": [
"df = pd.read_excel('dataset.xlsx',engine='openpyxl')\n",
"df.drop(['Unnamed: 0', 'visit id'], axis=1, inplace=True)"
],
"id": "58c520662874e959",
"outputs": [],
"execution_count": 58
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Load parameters ",
"id": "ddaeeb3e1d01160c"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T14:09:28.840294Z",
"start_time": "2024-09-18T14:09:28.830692Z"
}
},
"cell_type": "code",
"source": [
"best_preprocess_params = {'missing_value_per': 0.4, 'variance_threshold': 0.05, 'min_null_per': 0.5}\n",
"best_model_params = {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 250}\n",
"best_model_name = 'random_forest'"
],
"id": "60b676032873d47f",
"outputs": [],
"execution_count": 49
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Train model",
"id": "d80f652307c3c899"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Split Data",
"id": "f9282ded41cb1289"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T14:16:44.591809Z",
"start_time": "2024-09-18T14:16:44.567813Z"
}
},
"cell_type": "code",
"source": [
"train, test = train_test_split(df, test_size=0.2, random_state=42)\n",
"# train, test = train_test_split(df, test_size=0.2, shuffle=True)\n",
"train = df.copy()"
],
"id": "52e1e032203f5405",
"outputs": [],
"execution_count": 59
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Preprocess Train data",
"id": "91547d20321be35e"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T14:16:52.162821Z",
"start_time": "2024-09-18T14:16:49.894327Z"
}
},
"cell_type": "code",
"source": [
"preprocessor = Preprocess(\n",
" dataframe=train,\n",
" missing_value_per=best_preprocess_params['missing_value_per'],\n",
" variance_threshold=best_preprocess_params['variance_threshold'],\n",
" min_null_per=best_preprocess_params['min_null_per']\n",
")\n",
"preprocessor.apply()\n",
"preprocessed_train = preprocessor.dataframe\n",
"\n",
"m = MissingValue(preprocessed_train)\n",
"preprocessed_train = m.fill_dataframe()"
],
"id": "ef81c209417a2c8",
"outputs": [],
"execution_count": 60
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Preprocess Test data",
"id": "3bd2f85b3e6c8885"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T14:09:40.036788Z",
"start_time": "2024-09-18T14:09:36.211438Z"
}
},
"cell_type": "code",
"source": [
"test = preprocessor._mapping(test)\n",
"m = MissingValue(test)\n",
"test = m.fill_dataframe()"
],
"id": "671c2557d34f07a6",
"outputs": [],
"execution_count": 52
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Scale Train Data",
"id": "3f09fd22d5182088"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T14:17:05.920291Z",
"start_time": "2024-09-18T14:17:05.880598Z"
}
},
"cell_type": "code",
"source": [
"sc = StandardScaler()\n",
"x_train = preprocessed_train.drop(['target label / yes no'], axis=1)\n",
"cols = x_train.columns\n",
"\n",
"x_train = sc.fit_transform(x_train)\n",
"y_train = preprocessed_train['target label / yes no'].values.astype('int')"
],
"id": "bee2fbd612008390",
"outputs": [],
"execution_count": 61
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Train",
"id": "4c80efea4c08fede"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T14:18:00.904530Z",
"start_time": "2024-09-18T14:17:59.809184Z"
}
},
"cell_type": "code",
"source": [
"rf = RandomForestClassifier(\n",
" bootstrap=best_model_params['bootstrap'],\n",
" max_depth=best_model_params['max_depth'],\n",
" min_samples_split=best_model_params['min_samples_split'],\n",
" min_samples_leaf=best_model_params['min_samples_leaf'],\n",
" n_estimators=best_model_params['n_estimators']\n",
")\n",
"rf.fit(x_train, y_train)"
],
"id": "f66b83d56aeb2aa6",
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10,\n",
" n_estimators=250)"
],
"text/html": [
"<style>#sk-container-id-5 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-5 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-5 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-5 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-5 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-5 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-5 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-5 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-5 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-5 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-5 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-5 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-5 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-5 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-5 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-5 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-5 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-5 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-5 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-5 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-5 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-5 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-5\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10,\n",
" n_estimators=250)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" checked><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> RandomForestClassifier<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestClassifier.html\">?<span>Documentation for RandomForestClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10,\n",
" n_estimators=250)</pre></div> </div></div></div></div>"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 62
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Test Before Save",
"id": "41003b7e6f825"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T13:28:48.784607Z",
"start_time": "2024-09-18T13:28:48.762593Z"
}
},
"cell_type": "code",
"source": "test.shape",
"id": "de7dd264e5d3b81c",
"outputs": [
{
"data": {
"text/plain": [
"(385, 225)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 9
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T14:18:24.953162Z",
"start_time": "2024-09-18T14:18:24.876202Z"
}
},
"cell_type": "code",
"source": [
"x_test = test.drop(['target label / yes no'], axis=1)\n",
"x_test = x_test[cols]\n",
"# f = FeatureEngineering(x_test)\n",
"# x_test = f.kmeans(5).values\n",
"x_test = sc.transform(x_test)\n",
"y_test = test['target label / yes no'].values.astype('int')\n",
"\n",
"y_pred = rf.predict(x_test)\n",
"# y_pred = xg.predict(x_test)\n",
"\n",
"from sklearn.metrics import f1_score\n",
"f1 = f1_score(y_test, y_pred, average='weighted')"
],
"id": "7e018b4ce075af37",
"outputs": [
{
"ename": "ValueError",
"evalue": "could not convert string to float: 'ZERO'",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mValueError\u001B[0m Traceback (most recent call last)",
"\u001B[1;32m~\\AppData\\Local\\Temp\\ipykernel_27360\\853815528.py\u001B[0m in \u001B[0;36m?\u001B[1;34m()\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[0mx_test\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mtest\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdrop\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;34m'target label / yes no'\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0maxis\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;36m1\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 2\u001B[0m \u001B[0mx_test\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mx_test\u001B[0m\u001B[1;33m[\u001B[0m\u001B[0mcols\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 3\u001B[0m \u001B[1;31m# f = FeatureEngineering(x_test)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 4\u001B[0m \u001B[1;31m# x_test = f.kmeans(5).values\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 5\u001B[1;33m \u001B[0mx_test\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0msc\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mtransform\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mx_test\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 6\u001B[0m \u001B[0my_test\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mtest\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;34m'target label / yes no'\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mvalues\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mastype\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m'int'\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 7\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 8\u001B[0m \u001B[0my_pred\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mrf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mpredict\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mx_test\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\sklearn\\utils\\_set_output.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(self, X, *args, **kwargs)\u001B[0m\n\u001B[0;32m 311\u001B[0m \u001B[1;33m@\u001B[0m\u001B[0mwraps\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mf\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 312\u001B[0m \u001B[1;32mdef\u001B[0m \u001B[0mwrapped\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m*\u001B[0m\u001B[0margs\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 313\u001B[1;33m \u001B[0mdata_to_wrap\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mf\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m*\u001B[0m\u001B[0margs\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 314\u001B[0m \u001B[1;32mif\u001B[0m \u001B[0misinstance\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mdata_to_wrap\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mtuple\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 315\u001B[0m \u001B[1;31m# only wrap the first output for cross decomposition\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 316\u001B[0m return_tuple = (\n",
"\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\sklearn\\preprocessing\\_data.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(self, X, copy)\u001B[0m\n\u001B[0;32m 1041\u001B[0m \"\"\"\n\u001B[0;32m 1042\u001B[0m \u001B[0mcheck_is_fitted\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1043\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1044\u001B[0m \u001B[0mcopy\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mcopy\u001B[0m \u001B[1;32mif\u001B[0m \u001B[0mcopy\u001B[0m \u001B[1;32mis\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[1;32mNone\u001B[0m \u001B[1;32melse\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mcopy\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 1045\u001B[1;33m X = self._validate_data(\n\u001B[0m\u001B[0;32m 1046\u001B[0m \u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1047\u001B[0m \u001B[0mreset\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;32mFalse\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1048\u001B[0m \u001B[0maccept_sparse\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;34m\"csr\"\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\sklearn\\base.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001B[0m\n\u001B[0;32m 629\u001B[0m \u001B[0mout\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0my\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 630\u001B[0m \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 631\u001B[0m \u001B[0mout\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0my\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 632\u001B[0m \u001B[1;32melif\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[0mno_val_X\u001B[0m \u001B[1;32mand\u001B[0m \u001B[0mno_val_y\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 633\u001B[1;33m \u001B[0mout\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mcheck_array\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0minput_name\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;34m\"X\"\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mcheck_params\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 634\u001B[0m \u001B[1;32melif\u001B[0m \u001B[0mno_val_X\u001B[0m \u001B[1;32mand\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[0mno_val_y\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 635\u001B[0m \u001B[0mout\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0m_check_y\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0my\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mcheck_params\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 636\u001B[0m \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\sklearn\\utils\\validation.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001B[0m\n\u001B[0;32m 1009\u001B[0m )\n\u001B[0;32m 1010\u001B[0m \u001B[0marray\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mxp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mastype\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mcopy\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;32mFalse\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1011\u001B[0m \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1012\u001B[0m \u001B[0marray\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0m_asarray_with_order\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0morder\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0morder\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mxp\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mxp\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 1013\u001B[1;33m \u001B[1;32mexcept\u001B[0m \u001B[0mComplexWarning\u001B[0m \u001B[1;32mas\u001B[0m \u001B[0mcomplex_warning\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 1014\u001B[0m raise ValueError(\n\u001B[0;32m 1015\u001B[0m \u001B[1;34m\"Complex data not supported\\n{}\\n\"\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mformat\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1016\u001B[0m ) from complex_warning\n",
"\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\sklearn\\utils\\_array_api.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(array, dtype, order, copy, xp, device)\u001B[0m\n\u001B[0;32m 747\u001B[0m \u001B[1;31m# Use NumPy API to support order\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 748\u001B[0m \u001B[1;32mif\u001B[0m \u001B[0mcopy\u001B[0m \u001B[1;32mis\u001B[0m \u001B[1;32mTrue\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 749\u001B[0m \u001B[0marray\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mnumpy\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0morder\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0morder\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 750\u001B[0m \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 751\u001B[1;33m \u001B[0marray\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mnumpy\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0masarray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0morder\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0morder\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 752\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 753\u001B[0m \u001B[1;31m# At this point array is a NumPy ndarray. We convert it to an array\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 754\u001B[0m \u001B[1;31m# container that is consistent with the input's namespace.\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\pandas\\core\\generic.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(self, dtype, copy)\u001B[0m\n\u001B[0;32m 2149\u001B[0m def __array__(\n\u001B[0;32m 2150\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m:\u001B[0m \u001B[0mnpt\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mDTypeLike\u001B[0m \u001B[1;33m|\u001B[0m \u001B[1;32mNone\u001B[0m \u001B[1;33m=\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mcopy\u001B[0m\u001B[1;33m:\u001B[0m \u001B[0mbool_t\u001B[0m \u001B[1;33m|\u001B[0m \u001B[1;32mNone\u001B[0m \u001B[1;33m=\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 2151\u001B[0m ) -> np.ndarray:\n\u001B[0;32m 2152\u001B[0m \u001B[0mvalues\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_values\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 2153\u001B[1;33m \u001B[0marr\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mnp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0masarray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mvalues\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 2154\u001B[0m if (\n\u001B[0;32m 2155\u001B[0m \u001B[0mastype_is_view\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mvalues\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0marr\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 2156\u001B[0m \u001B[1;32mand\u001B[0m \u001B[0musing_copy_on_write\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;31mValueError\u001B[0m: could not convert string to float: 'ZERO'"
]
}
],
"execution_count": 63
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T14:09:48.970481Z",
"start_time": "2024-09-18T14:09:48.956483Z"
}
},
"cell_type": "code",
"source": "print(f1)",
"id": "ccd06ef8b06c98a1",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.8371721052057187\n"
]
}
],
"execution_count": 56
},
{
"metadata": {},
"cell_type": "markdown",
"source": "### Save Scaler and Cols",
"id": "489313be13e66c00"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T14:18:30.797406Z",
"start_time": "2024-09-18T14:18:30.769400Z"
}
},
"cell_type": "code",
"source": [
"with open(f'model/columns.pkl', 'wb') as f:\n",
" pickle.dump(cols.to_list(), f)\n",
"\n",
"with open(f'model/scaler.pkl', 'wb') as f:\n",
" pickle.dump(sc, f)\n",
"\n",
"with open(f'model/model.pkl', 'wb') as f:\n",
" pickle.dump(rf, f)"
],
"id": "4b257b89edd730ba",
"outputs": [],
"execution_count": 64
},
{
"metadata": {},
"cell_type": "markdown",
"source": "# Loop",
"id": "4bb21a9641b6823"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T13:58:15.066193Z",
"start_time": "2024-09-18T13:57:49.137112Z"
}
},
"cell_type": "code",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"0.8465739750445631\n",
"--------------------\n",
"1\n",
"0.8587816632306016\n",
"--------------------\n",
"2\n",
"0.8692866312086448\n",
"--------------------\n",
"3\n",
"0.8277746239161031\n",
"--------------------\n",
"4\n",
"0.8769936338171633\n",
"--------------------\n"
]
}
],
"execution_count": 43,
"source": [
"best_preprocess_params = {'missing_value_per': 0.4, 'variance_threshold': 0.05, 'min_null_per': 0.5}\n",
"best_model_params = {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 250}\n",
"best_model_name = 'random_forest'\n",
"\n",
"f1_list = []\n",
"for i in range(5):\n",
" print(i)\n",
" i+=1\n",
" train, test = train_test_split(df, test_size=0.2, shuffle=True)\n",
" \n",
" preprocessor = Preprocess(\n",
" dataframe=train,\n",
" missing_value_per=best_preprocess_params['missing_value_per'],\n",
" variance_threshold=best_preprocess_params['variance_threshold'],\n",
" min_null_per=best_preprocess_params['min_null_per']\n",
" )\n",
" preprocessor.apply()\n",
" preprocessed_train = preprocessor.dataframe\n",
" \n",
" m = MissingValue(preprocessed_train)\n",
" preprocessed_train = m.fill_dataframe()\n",
" \n",
" test = preprocessor._mapping(test)\n",
" m = MissingValue(test)\n",
" test = m.fill_dataframe()\n",
" \n",
" sc = StandardScaler()\n",
" x_train = preprocessed_train.drop(['target label / yes no'], axis=1)\n",
" cols = x_train.columns\n",
" \n",
" x_train = sc.fit_transform(x_train)\n",
" y_train = preprocessed_train['target label / yes no'].values.astype('int')\n",
" \n",
" xg = XGBClassifier(\n",
" subsample=1.0,\n",
" colsample_bytree=0.8,\n",
" learning_rate=0.1,\n",
" max_depth=3,\n",
" n_estimators=100\n",
" \n",
" )\n",
" xg.fit(x_train, y_train)\n",
" # rf = RandomForestClassifier(\n",
" # bootstrap=best_model_params['bootstrap'],\n",
" # max_depth=best_model_params['max_depth'],\n",
" # min_samples_split=best_model_params['min_samples_split'],\n",
" # min_samples_leaf=best_model_params['min_samples_leaf'],\n",
" # n_estimators=best_model_params['n_estimators']\n",
" # )\n",
" # rf.fit(x_train, y_train)\n",
" \n",
" x_test = test.drop(['target label / yes no'], axis=1)\n",
" x_test = x_test[cols]\n",
" # f = FeatureEngineering(x_test)\n",
" # x_test = f.kmeans(5).values\n",
" x_test = sc.transform(x_test)\n",
" y_test = test['target label / yes no'].values.astype('int')\n",
" \n",
" # y_pred = rf.predict(x_test)\n",
" y_pred = xg.predict(x_test)\n",
" \n",
" f1 = f1_score(y_test, y_pred, average='weighted')\n",
" print(f1)\n",
" print('-'*20)\n",
" f1_list.append(f1)"
],
"id": "2541c2502467f414"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T13:58:15.931329Z",
"start_time": "2024-09-18T13:58:15.915334Z"
}
},
"cell_type": "code",
"outputs": [
{
"data": {
"text/plain": [
"0.8558821054434151"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 44,
"source": "sum(f1_list)/len(f1_list)",
"id": "e999e391b7942a67"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T13:56:24.034247Z",
"start_time": "2024-09-18T13:56:24.022243Z"
}
},
"cell_type": "code",
"outputs": [
{
"data": {
"text/plain": [
"[0.7916205533596838,\n",
" 0.843782847316944,\n",
" 0.8567467859318522,\n",
" 0.8484060247934703,\n",
" 0.841208207987869]"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 42,
"source": "f1_list",
"id": "7a4e0d32bc1dd26f"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T15:41:32.938129Z",
"start_time": "2024-09-18T15:41:32.906823Z"
}
},
"cell_type": "code",
"source": [
"import pickle \n",
"with open(f'model/columns.pkl', 'rb') as f:\n",
" cols = pickle.load(f)\n",
"\n",
"with open(f'model/scaler.pkl', 'rb') as f:\n",
" sc = pickle.load(f)\n",
"\n",
"with open(f'model/model.pkl', 'rb') as f:\n",
" model = pickle.load(f)"
],
"id": "6b4773d45b76890e",
"outputs": [],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T15:41:15.600339Z",
"start_time": "2024-09-18T15:41:09.822041Z"
}
},
"cell_type": "code",
"source": [
"import pandas as pd \n",
"df = pd.read_excel('dataset.xlsx',engine='openpyxl')\n",
"df.drop(['Unnamed: 0', 'visit id'], axis=1, inplace=True)\n"
],
"id": "89de2953628be5e3",
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T15:41:36.959991Z",
"start_time": "2024-09-18T15:41:36.879965Z"
}
},
"cell_type": "code",
"source": [
"from utils import Preprocess\n",
"pre = Preprocess(df, missing_value_per=0.4, variance_threshold=0.05, min_null_per=0.5)\n",
"df = pre._mapping(df)\n",
"cols.append('target label / yes no')\n",
"df = df[cols]\n",
"X = df.drop(['target label / yes no'], axis=1)\n",
"X = sc.transform(X)\n",
"y = df['target label / yes no'].fillna(0).to_numpy()\n",
"y_pre = model.predict(X)"
],
"id": "3e387344a200e132",
"outputs": [],
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T15:41:39.115344Z",
"start_time": "2024-09-18T15:41:39.104343Z"
}
},
"cell_type": "code",
"source": "y",
"id": "11c53c9d426b25ac",
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, ..., 0, 0, 0], dtype=object)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 6
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-18T15:41:54.021876Z",
"start_time": "2024-09-18T15:41:53.999870Z"
}
},
"cell_type": "code",
"source": [
"from sklearn.metrics import f1_score\n",
"\n",
"f1_score(y.astype(int), y_pre.astype(int), average='weighted')"
],
"id": "78c1b6bcf860b84f",
"outputs": [
{
"data": {
"text/plain": [
"0.9027694811920448"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 7
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}