[59083a]: / preprocess.ipynb

Download this file

1265 lines (1264 with data), 55.3 kB

{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Load Data",
   "id": "6d1b372bd242a6f7"
  },
  {
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-09-18T10:13:09.095287Z",
     "start_time": "2024-09-18T10:13:07.212667Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "df = pd.read_excel('dataset.xlsx',engine='openpyxl')\n",
    "df.drop(['Unnamed: 0', 'visit id'], axis=1, inplace=True)\n",
    "from sklearn.model_selection import train_test_split\n",
    "# X = df.drop(['target label / yes no'],axis=1)\n",
    "# y = df['target label / yes no']\n",
    "# df_train, df_test= train_test_split(df, test_size=0.2,shuffle=True, random_state=42)"
   ],
   "id": "initial_id",
   "outputs": [],
   "execution_count": 2
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Grid Search\n",
    "Search for best paramters and log them in mlflow"
   ],
   "id": "7ba2217da72ca9c1"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T11:29:19.406874Z",
     "start_time": "2024-09-18T11:14:43.860088Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from utils import Preprocess, MissingValue\n",
    "\n",
    "from utils import MLModelSelector\n",
    "\n",
    "\n",
    "preprocess_param_grid  = {\n",
    "    'missing_value_per': [0.1, 0.2, 0.3, 0.4, 0.5],\n",
    "    'variance_threshold': [0.0, 0.01, 0.05, 0.1, 0.2],\n",
    "    'min_null_per': [0.5]\n",
    "}\n",
    "\n",
    "param_grids = {\n",
    "    # 'SVM': {\n",
    "    #     'C': [0.1, 1, 10],         \n",
    "    #     'kernel': ['linear', 'rbf'] \n",
    "    # },\n",
    "    # 'logistic_regression': {\n",
    "    #     'C': [0.1, 1, 10],         \n",
    "    #     'solver': ['lbfgs'],        \n",
    "    #     'max_iter': [500, 1000, 2000] \n",
    "    # },\n",
    "    'random_forest': {\n",
    "    'n_estimators': [150, 250],  # Number of trees in the forest\n",
    "    'max_depth': [10, 20],  # Maximum depth of the tree\n",
    "    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node\n",
    "    'min_samples_leaf': [1, 2],    # Minimum number of samples required to be at a leaf node\n",
    "    'bootstrap': [True],       # Whether bootstrap samples are used when building trees\n",
    "},\n",
    "    'XGBoost': {\n",
    "    'n_estimators': [100, 200],      # Number of boosting rounds\n",
    "    'max_depth': [3, 6],              # Maximum depth of a tree\n",
    "    'learning_rate': [0.1, 0.2],    # Step size shrinkage used in update to prevent overfitting\n",
    "    'subsample': [0.6, 1.0],         # Fraction of samples used for training each tree\n",
    "    'colsample_bytree': [0.8, 1.0],  # Fraction of features used at each split\n",
    "}\n",
    "}\n",
    "\n",
    "best_preprocess_params = None\n",
    "best_model_params = None\n",
    "best_overall_score = -1\n",
    "best_model_name = None\n",
    "i = 0\n",
    "for missing_value_per in preprocess_param_grid['missing_value_per']:\n",
    "    for variance_threshold in preprocess_param_grid['variance_threshold']:\n",
    "        for min_null_per in preprocess_param_grid['min_null_per']:\n",
    "            df_train, df_test= train_test_split(df, test_size=0.2,shuffle=True)\n",
    "            preprocess_params = {\n",
    "                'missing_value_per': missing_value_per,\n",
    "                'variance_threshold': variance_threshold,\n",
    "                'min_null_per': min_null_per\n",
    "            }\n",
    "            pre_data = Preprocess(\n",
    "                dataframe=df_train, \n",
    "                missing_value_per=missing_value_per, \n",
    "                variance_threshold=variance_threshold, \n",
    "                min_null_per=min_null_per\n",
    "            )\n",
    "            pre_data.apply()\n",
    "            df_train = pre_data.dataframe\n",
    "            missing = MissingValue(original_df=df_train,test_size=0.1)\n",
    "            df_train = missing.fill_dataframe()\n",
    "            cols = df_train.columns\n",
    "            X_train = df_train.drop(['target label / yes no'], axis=1)\n",
    "            y_train = df_train['target label / yes no']\n",
    "            \n",
    "            df_test = df_test[cols]\n",
    " \n",
    "            df_test = pre_data._mapping(df_test)\n",
    "            missing = MissingValue(original_df=df_test,test_size=0.1)\n",
    "            df_test = missing.fill_dataframe()\n",
    "            X_test = df_test.drop(['target label / yes no'], axis=1)\n",
    "            y_test = df_test['target label / yes no'].values.astype(int)\n",
    "            \n",
    "            for model_name, param_grid in param_grids.items():\n",
    "                model_selector = MLModelSelector()\n",
    "                best_params, best_score = model_selector.train_model(\n",
    "                    X_train, y_train,X_test,y_test, \n",
    "                    model_name, \n",
    "                    param_grid, \n",
    "                    preprocess_parameters=preprocess_params  \n",
    "                )\n",
    "                \n",
    "                if best_score > best_overall_score:\n",
    "                    best_overall_score = best_score\n",
    "                    best_preprocess_params = preprocess_params\n",
    "                    best_model_params = best_params\n",
    "                    best_model_name = model_name\n",
    "                   \n",
    "            i = i + 1\n",
    "            print(i)\n",
    "            print(best_overall_score)\n",
    "\n",
    "print(f\"Best Preprocess Params: {best_preprocess_params}\")\n",
    "print(f\"Best Model Params: {best_model_params}\")\n",
    "print(f\"Best Overall Score: {best_overall_score}\")\n"
   ],
   "id": "4228dc0652080dc5",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n",
      "0.7454545454545455\n",
      "2\n",
      "0.7454545454545455\n",
      "3\n",
      "0.7484030554078361\n",
      "4\n",
      "0.7546958304853042\n",
      "5\n",
      "0.7546958304853042\n",
      "6\n",
      "0.7562392881887466\n",
      "7\n",
      "0.7562392881887466\n",
      "8\n",
      "0.7562392881887466\n",
      "9\n",
      "0.7562392881887466\n",
      "10\n",
      "0.7562392881887466\n",
      "11\n",
      "0.7562392881887466\n",
      "12\n",
      "0.7562392881887466\n",
      "13\n",
      "0.7562392881887466\n",
      "14\n",
      "0.7562392881887466\n",
      "15\n",
      "0.7562392881887466\n",
      "16\n",
      "0.8621799805212067\n",
      "17\n",
      "0.8621799805212067\n",
      "18\n",
      "0.8853833897195243\n",
      "19\n",
      "0.8853833897195243\n",
      "20\n",
      "0.8853833897195243\n",
      "21\n",
      "0.8853833897195243\n",
      "22\n",
      "0.8853833897195243\n",
      "23\n",
      "0.8853833897195243\n",
      "24\n",
      "0.8853833897195243\n",
      "25\n",
      "0.8853833897195243\n",
      "Best Preprocess Params: {'missing_value_per': 0.4, 'variance_threshold': 0.05, 'min_null_per': 0.5}\n",
      "Best Model Params: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 250}\n",
      "Best Overall Score: 0.8853833897195243\n"
     ]
    }
   ],
   "execution_count": 16
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# Train Best Model\n",
    "Use the best parameter and save the model and other important elements"
   ],
   "id": "8dd1bb70b6cb71e9"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T14:09:22.818872Z",
     "start_time": "2024-09-18T14:09:22.798869Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from utils import Preprocess, MissingValue\n",
    "import pickle\n",
    "from sklearn.metrics import f1_score\n",
    "from xgboost import XGBClassifier"
   ],
   "id": "43b0c8f8f2c807ea",
   "outputs": [],
   "execution_count": 47
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Load Data",
   "id": "c08e3333ceadab75"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T14:16:42.761730Z",
     "start_time": "2024-09-18T14:16:37.848559Z"
    }
   },
   "cell_type": "code",
   "source": [
    "df = pd.read_excel('dataset.xlsx',engine='openpyxl')\n",
    "df.drop(['Unnamed: 0', 'visit id'], axis=1, inplace=True)"
   ],
   "id": "58c520662874e959",
   "outputs": [],
   "execution_count": 58
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Load parameters ",
   "id": "ddaeeb3e1d01160c"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T14:09:28.840294Z",
     "start_time": "2024-09-18T14:09:28.830692Z"
    }
   },
   "cell_type": "code",
   "source": [
    "best_preprocess_params =  {'missing_value_per': 0.4, 'variance_threshold': 0.05, 'min_null_per': 0.5}\n",
    "best_model_params = {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 250}\n",
    "best_model_name = 'random_forest'"
   ],
   "id": "60b676032873d47f",
   "outputs": [],
   "execution_count": 49
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Train model",
   "id": "d80f652307c3c899"
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### Split Data",
   "id": "f9282ded41cb1289"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T14:16:44.591809Z",
     "start_time": "2024-09-18T14:16:44.567813Z"
    }
   },
   "cell_type": "code",
   "source": [
    "train, test = train_test_split(df, test_size=0.2, random_state=42)\n",
    "# train, test = train_test_split(df, test_size=0.2, shuffle=True)\n",
    "train = df.copy()"
   ],
   "id": "52e1e032203f5405",
   "outputs": [],
   "execution_count": 59
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Preprocess Train data",
   "id": "91547d20321be35e"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T14:16:52.162821Z",
     "start_time": "2024-09-18T14:16:49.894327Z"
    }
   },
   "cell_type": "code",
   "source": [
    "preprocessor = Preprocess(\n",
    "    dataframe=train,\n",
    "    missing_value_per=best_preprocess_params['missing_value_per'],\n",
    "    variance_threshold=best_preprocess_params['variance_threshold'],\n",
    "    min_null_per=best_preprocess_params['min_null_per']\n",
    ")\n",
    "preprocessor.apply()\n",
    "preprocessed_train = preprocessor.dataframe\n",
    "\n",
    "m = MissingValue(preprocessed_train)\n",
    "preprocessed_train = m.fill_dataframe()"
   ],
   "id": "ef81c209417a2c8",
   "outputs": [],
   "execution_count": 60
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### Preprocess Test data",
   "id": "3bd2f85b3e6c8885"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T14:09:40.036788Z",
     "start_time": "2024-09-18T14:09:36.211438Z"
    }
   },
   "cell_type": "code",
   "source": [
    "test = preprocessor._mapping(test)\n",
    "m = MissingValue(test)\n",
    "test = m.fill_dataframe()"
   ],
   "id": "671c2557d34f07a6",
   "outputs": [],
   "execution_count": 52
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### Scale Train Data",
   "id": "3f09fd22d5182088"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T14:17:05.920291Z",
     "start_time": "2024-09-18T14:17:05.880598Z"
    }
   },
   "cell_type": "code",
   "source": [
    "sc = StandardScaler()\n",
    "x_train = preprocessed_train.drop(['target label / yes no'], axis=1)\n",
    "cols = x_train.columns\n",
    "\n",
    "x_train = sc.fit_transform(x_train)\n",
    "y_train = preprocessed_train['target label / yes no'].values.astype('int')"
   ],
   "id": "bee2fbd612008390",
   "outputs": [],
   "execution_count": 61
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### Train",
   "id": "4c80efea4c08fede"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T14:18:00.904530Z",
     "start_time": "2024-09-18T14:17:59.809184Z"
    }
   },
   "cell_type": "code",
   "source": [
    "rf = RandomForestClassifier(\n",
    "    bootstrap=best_model_params['bootstrap'],\n",
    "    max_depth=best_model_params['max_depth'],\n",
    "    min_samples_split=best_model_params['min_samples_split'],\n",
    "    min_samples_leaf=best_model_params['min_samples_leaf'],\n",
    "    n_estimators=best_model_params['n_estimators']\n",
    ")\n",
    "rf.fit(x_train, y_train)"
   ],
   "id": "f66b83d56aeb2aa6",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10,\n",
       "                       n_estimators=250)"
      ],
      "text/html": [
       "<style>#sk-container-id-5 {\n",
       "  /* Definition of color scheme common for light and dark mode */\n",
       "  --sklearn-color-text: black;\n",
       "  --sklearn-color-line: gray;\n",
       "  /* Definition of color scheme for unfitted estimators */\n",
       "  --sklearn-color-unfitted-level-0: #fff5e6;\n",
       "  --sklearn-color-unfitted-level-1: #f6e4d2;\n",
       "  --sklearn-color-unfitted-level-2: #ffe0b3;\n",
       "  --sklearn-color-unfitted-level-3: chocolate;\n",
       "  /* Definition of color scheme for fitted estimators */\n",
       "  --sklearn-color-fitted-level-0: #f0f8ff;\n",
       "  --sklearn-color-fitted-level-1: #d4ebff;\n",
       "  --sklearn-color-fitted-level-2: #b3dbfd;\n",
       "  --sklearn-color-fitted-level-3: cornflowerblue;\n",
       "\n",
       "  /* Specific color for light theme */\n",
       "  --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
       "  --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
       "  --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
       "  --sklearn-color-icon: #696969;\n",
       "\n",
       "  @media (prefers-color-scheme: dark) {\n",
       "    /* Redefinition of color scheme for dark theme */\n",
       "    --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
       "    --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
       "    --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
       "    --sklearn-color-icon: #878787;\n",
       "  }\n",
       "}\n",
       "\n",
       "#sk-container-id-5 {\n",
       "  color: var(--sklearn-color-text);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 pre {\n",
       "  padding: 0;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 input.sk-hidden--visually {\n",
       "  border: 0;\n",
       "  clip: rect(1px 1px 1px 1px);\n",
       "  clip: rect(1px, 1px, 1px, 1px);\n",
       "  height: 1px;\n",
       "  margin: -1px;\n",
       "  overflow: hidden;\n",
       "  padding: 0;\n",
       "  position: absolute;\n",
       "  width: 1px;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-dashed-wrapped {\n",
       "  border: 1px dashed var(--sklearn-color-line);\n",
       "  margin: 0 0.4em 0.5em 0.4em;\n",
       "  box-sizing: border-box;\n",
       "  padding-bottom: 0.4em;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-container {\n",
       "  /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
       "     but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
       "     so we also need the `!important` here to be able to override the\n",
       "     default hidden behavior on the sphinx rendered scikit-learn.org.\n",
       "     See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
       "  display: inline-block !important;\n",
       "  position: relative;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-text-repr-fallback {\n",
       "  display: none;\n",
       "}\n",
       "\n",
       "div.sk-parallel-item,\n",
       "div.sk-serial,\n",
       "div.sk-item {\n",
       "  /* draw centered vertical line to link estimators */\n",
       "  background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
       "  background-size: 2px 100%;\n",
       "  background-repeat: no-repeat;\n",
       "  background-position: center center;\n",
       "}\n",
       "\n",
       "/* Parallel-specific style estimator block */\n",
       "\n",
       "#sk-container-id-5 div.sk-parallel-item::after {\n",
       "  content: \"\";\n",
       "  width: 100%;\n",
       "  border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
       "  flex-grow: 1;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-parallel {\n",
       "  display: flex;\n",
       "  align-items: stretch;\n",
       "  justify-content: center;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  position: relative;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-parallel-item {\n",
       "  display: flex;\n",
       "  flex-direction: column;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-parallel-item:first-child::after {\n",
       "  align-self: flex-end;\n",
       "  width: 50%;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-parallel-item:last-child::after {\n",
       "  align-self: flex-start;\n",
       "  width: 50%;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-parallel-item:only-child::after {\n",
       "  width: 0;\n",
       "}\n",
       "\n",
       "/* Serial-specific style estimator block */\n",
       "\n",
       "#sk-container-id-5 div.sk-serial {\n",
       "  display: flex;\n",
       "  flex-direction: column;\n",
       "  align-items: center;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  padding-right: 1em;\n",
       "  padding-left: 1em;\n",
       "}\n",
       "\n",
       "\n",
       "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
       "clickable and can be expanded/collapsed.\n",
       "- Pipeline and ColumnTransformer use this feature and define the default style\n",
       "- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
       "*/\n",
       "\n",
       "/* Pipeline and ColumnTransformer style (default) */\n",
       "\n",
       "#sk-container-id-5 div.sk-toggleable {\n",
       "  /* Default theme specific background. It is overwritten whether we have a\n",
       "  specific estimator or a Pipeline/ColumnTransformer */\n",
       "  background-color: var(--sklearn-color-background);\n",
       "}\n",
       "\n",
       "/* Toggleable label */\n",
       "#sk-container-id-5 label.sk-toggleable__label {\n",
       "  cursor: pointer;\n",
       "  display: block;\n",
       "  width: 100%;\n",
       "  margin-bottom: 0;\n",
       "  padding: 0.5em;\n",
       "  box-sizing: border-box;\n",
       "  text-align: center;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 label.sk-toggleable__label-arrow:before {\n",
       "  /* Arrow on the left of the label */\n",
       "  content: \"▸\";\n",
       "  float: left;\n",
       "  margin-right: 0.25em;\n",
       "  color: var(--sklearn-color-icon);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 label.sk-toggleable__label-arrow:hover:before {\n",
       "  color: var(--sklearn-color-text);\n",
       "}\n",
       "\n",
       "/* Toggleable content - dropdown */\n",
       "\n",
       "#sk-container-id-5 div.sk-toggleable__content {\n",
       "  max-height: 0;\n",
       "  max-width: 0;\n",
       "  overflow: hidden;\n",
       "  text-align: left;\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-toggleable__content.fitted {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-toggleable__content pre {\n",
       "  margin: 0.2em;\n",
       "  border-radius: 0.25em;\n",
       "  color: var(--sklearn-color-text);\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-toggleable__content.fitted pre {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
       "  /* Expand drop-down */\n",
       "  max-height: 200px;\n",
       "  max-width: 100%;\n",
       "  overflow: auto;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
       "  content: \"▾\";\n",
       "}\n",
       "\n",
       "/* Pipeline/ColumnTransformer-specific style */\n",
       "\n",
       "#sk-container-id-5 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "/* Estimator-specific style */\n",
       "\n",
       "/* Colorize estimator box */\n",
       "#sk-container-id-5 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-label label.sk-toggleable__label,\n",
       "#sk-container-id-5 div.sk-label label {\n",
       "  /* The background is the default theme color */\n",
       "  color: var(--sklearn-color-text-on-default-background);\n",
       "}\n",
       "\n",
       "/* On hover, darken the color of the background */\n",
       "#sk-container-id-5 div.sk-label:hover label.sk-toggleable__label {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "/* Label box, darken color on hover, fitted */\n",
       "#sk-container-id-5 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
       "  color: var(--sklearn-color-text);\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "/* Estimator label */\n",
       "\n",
       "#sk-container-id-5 div.sk-label label {\n",
       "  font-family: monospace;\n",
       "  font-weight: bold;\n",
       "  display: inline-block;\n",
       "  line-height: 1.2em;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-label-container {\n",
       "  text-align: center;\n",
       "}\n",
       "\n",
       "/* Estimator-specific */\n",
       "#sk-container-id-5 div.sk-estimator {\n",
       "  font-family: monospace;\n",
       "  border: 1px dotted var(--sklearn-color-border-box);\n",
       "  border-radius: 0.25em;\n",
       "  box-sizing: border-box;\n",
       "  margin-bottom: 0.5em;\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-0);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-estimator.fitted {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-0);\n",
       "}\n",
       "\n",
       "/* on hover */\n",
       "#sk-container-id-5 div.sk-estimator:hover {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-2);\n",
       "}\n",
       "\n",
       "#sk-container-id-5 div.sk-estimator.fitted:hover {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-2);\n",
       "}\n",
       "\n",
       "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
       "\n",
       "/* Common style for \"i\" and \"?\" */\n",
       "\n",
       ".sk-estimator-doc-link,\n",
       "a:link.sk-estimator-doc-link,\n",
       "a:visited.sk-estimator-doc-link {\n",
       "  float: right;\n",
       "  font-size: smaller;\n",
       "  line-height: 1em;\n",
       "  font-family: monospace;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  border-radius: 1em;\n",
       "  height: 1em;\n",
       "  width: 1em;\n",
       "  text-decoration: none !important;\n",
       "  margin-left: 1ex;\n",
       "  /* unfitted */\n",
       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
       "  color: var(--sklearn-color-unfitted-level-1);\n",
       "}\n",
       "\n",
       ".sk-estimator-doc-link.fitted,\n",
       "a:link.sk-estimator-doc-link.fitted,\n",
       "a:visited.sk-estimator-doc-link.fitted {\n",
       "  /* fitted */\n",
       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
       "  color: var(--sklearn-color-fitted-level-1);\n",
       "}\n",
       "\n",
       "/* On hover */\n",
       "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
       ".sk-estimator-doc-link:hover,\n",
       "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
       ".sk-estimator-doc-link:hover {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
       "  color: var(--sklearn-color-background);\n",
       "  text-decoration: none;\n",
       "}\n",
       "\n",
       "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
       ".sk-estimator-doc-link.fitted:hover,\n",
       "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
       ".sk-estimator-doc-link.fitted:hover {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-3);\n",
       "  color: var(--sklearn-color-background);\n",
       "  text-decoration: none;\n",
       "}\n",
       "\n",
       "/* Span, style for the box shown on hovering the info icon */\n",
       ".sk-estimator-doc-link span {\n",
       "  display: none;\n",
       "  z-index: 9999;\n",
       "  position: relative;\n",
       "  font-weight: normal;\n",
       "  right: .2ex;\n",
       "  padding: .5ex;\n",
       "  margin: .5ex;\n",
       "  width: min-content;\n",
       "  min-width: 20ex;\n",
       "  max-width: 50ex;\n",
       "  color: var(--sklearn-color-text);\n",
       "  box-shadow: 2pt 2pt 4pt #999;\n",
       "  /* unfitted */\n",
       "  background: var(--sklearn-color-unfitted-level-0);\n",
       "  border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
       "}\n",
       "\n",
       ".sk-estimator-doc-link.fitted span {\n",
       "  /* fitted */\n",
       "  background: var(--sklearn-color-fitted-level-0);\n",
       "  border: var(--sklearn-color-fitted-level-3);\n",
       "}\n",
       "\n",
       ".sk-estimator-doc-link:hover span {\n",
       "  display: block;\n",
       "}\n",
       "\n",
       "/* \"?\"-specific style due to the `<a>` HTML tag */\n",
       "\n",
       "#sk-container-id-5 a.estimator_doc_link {\n",
       "  float: right;\n",
       "  font-size: 1rem;\n",
       "  line-height: 1em;\n",
       "  font-family: monospace;\n",
       "  background-color: var(--sklearn-color-background);\n",
       "  border-radius: 1rem;\n",
       "  height: 1rem;\n",
       "  width: 1rem;\n",
       "  text-decoration: none;\n",
       "  /* unfitted */\n",
       "  color: var(--sklearn-color-unfitted-level-1);\n",
       "  border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 a.estimator_doc_link.fitted {\n",
       "  /* fitted */\n",
       "  border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
       "  color: var(--sklearn-color-fitted-level-1);\n",
       "}\n",
       "\n",
       "/* On hover */\n",
       "#sk-container-id-5 a.estimator_doc_link:hover {\n",
       "  /* unfitted */\n",
       "  background-color: var(--sklearn-color-unfitted-level-3);\n",
       "  color: var(--sklearn-color-background);\n",
       "  text-decoration: none;\n",
       "}\n",
       "\n",
       "#sk-container-id-5 a.estimator_doc_link.fitted:hover {\n",
       "  /* fitted */\n",
       "  background-color: var(--sklearn-color-fitted-level-3);\n",
       "}\n",
       "</style><div id=\"sk-container-id-5\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10,\n",
       "                       n_estimators=250)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" checked><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;RandomForestClassifier<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestClassifier.html\">?<span>Documentation for RandomForestClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10,\n",
       "                       n_estimators=250)</pre></div> </div></div></div></div>"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 62
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### Test Before Save",
   "id": "41003b7e6f825"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T13:28:48.784607Z",
     "start_time": "2024-09-18T13:28:48.762593Z"
    }
   },
   "cell_type": "code",
   "source": "test.shape",
   "id": "de7dd264e5d3b81c",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(385, 225)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 9
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T14:18:24.953162Z",
     "start_time": "2024-09-18T14:18:24.876202Z"
    }
   },
   "cell_type": "code",
   "source": [
    "x_test = test.drop(['target label / yes no'], axis=1)\n",
    "x_test = x_test[cols]\n",
    "# f = FeatureEngineering(x_test)\n",
    "# x_test = f.kmeans(5).values\n",
    "x_test = sc.transform(x_test)\n",
    "y_test = test['target label / yes no'].values.astype('int')\n",
    "\n",
    "y_pred = rf.predict(x_test)\n",
    "# y_pred = xg.predict(x_test)\n",
    "\n",
    "from sklearn.metrics import f1_score\n",
    "f1 = f1_score(y_test, y_pred, average='weighted')"
   ],
   "id": "7e018b4ce075af37",
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "could not convert string to float: 'ZERO'",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mValueError\u001B[0m                                Traceback (most recent call last)",
      "\u001B[1;32m~\\AppData\\Local\\Temp\\ipykernel_27360\\853815528.py\u001B[0m in \u001B[0;36m?\u001B[1;34m()\u001B[0m\n\u001B[0;32m      1\u001B[0m \u001B[0mx_test\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mtest\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdrop\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;34m'target label / yes no'\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0maxis\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;36m1\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m      2\u001B[0m \u001B[0mx_test\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mx_test\u001B[0m\u001B[1;33m[\u001B[0m\u001B[0mcols\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m      3\u001B[0m \u001B[1;31m# f = FeatureEngineering(x_test)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m      4\u001B[0m \u001B[1;31m# x_test = f.kmeans(5).values\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 5\u001B[1;33m \u001B[0mx_test\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0msc\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mtransform\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mx_test\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m      6\u001B[0m \u001B[0my_test\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mtest\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;34m'target label / yes no'\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mvalues\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mastype\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m'int'\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m      7\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m      8\u001B[0m \u001B[0my_pred\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mrf\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mpredict\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mx_test\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\sklearn\\utils\\_set_output.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(self, X, *args, **kwargs)\u001B[0m\n\u001B[0;32m    311\u001B[0m     \u001B[1;33m@\u001B[0m\u001B[0mwraps\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mf\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    312\u001B[0m     \u001B[1;32mdef\u001B[0m \u001B[0mwrapped\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m*\u001B[0m\u001B[0margs\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 313\u001B[1;33m         \u001B[0mdata_to_wrap\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mf\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m*\u001B[0m\u001B[0margs\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m    314\u001B[0m         \u001B[1;32mif\u001B[0m \u001B[0misinstance\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mdata_to_wrap\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mtuple\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    315\u001B[0m             \u001B[1;31m# only wrap the first output for cross decomposition\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    316\u001B[0m             return_tuple = (\n",
      "\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\sklearn\\preprocessing\\_data.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(self, X, copy)\u001B[0m\n\u001B[0;32m   1041\u001B[0m         \"\"\"\n\u001B[0;32m   1042\u001B[0m         \u001B[0mcheck_is_fitted\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1043\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1044\u001B[0m         \u001B[0mcopy\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mcopy\u001B[0m \u001B[1;32mif\u001B[0m \u001B[0mcopy\u001B[0m \u001B[1;32mis\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[1;32mNone\u001B[0m \u001B[1;32melse\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mcopy\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 1045\u001B[1;33m         X = self._validate_data(\n\u001B[0m\u001B[0;32m   1046\u001B[0m             \u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1047\u001B[0m             \u001B[0mreset\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;32mFalse\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1048\u001B[0m             \u001B[0maccept_sparse\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;34m\"csr\"\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\sklearn\\base.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001B[0m\n\u001B[0;32m    629\u001B[0m                 \u001B[0mout\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0my\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    630\u001B[0m             \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    631\u001B[0m                 \u001B[0mout\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0my\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    632\u001B[0m         \u001B[1;32melif\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[0mno_val_X\u001B[0m \u001B[1;32mand\u001B[0m \u001B[0mno_val_y\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 633\u001B[1;33m             \u001B[0mout\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mcheck_array\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0minput_name\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;34m\"X\"\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mcheck_params\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m    634\u001B[0m         \u001B[1;32melif\u001B[0m \u001B[0mno_val_X\u001B[0m \u001B[1;32mand\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[0mno_val_y\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    635\u001B[0m             \u001B[0mout\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0m_check_y\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0my\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mcheck_params\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    636\u001B[0m         \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\sklearn\\utils\\validation.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001B[0m\n\u001B[0;32m   1009\u001B[0m                         )\n\u001B[0;32m   1010\u001B[0m                     \u001B[0marray\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mxp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mastype\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mcopy\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;32mFalse\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1011\u001B[0m                 \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1012\u001B[0m                     \u001B[0marray\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0m_asarray_with_order\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0morder\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0morder\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mxp\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mxp\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 1013\u001B[1;33m             \u001B[1;32mexcept\u001B[0m \u001B[0mComplexWarning\u001B[0m \u001B[1;32mas\u001B[0m \u001B[0mcomplex_warning\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m   1014\u001B[0m                 raise ValueError(\n\u001B[0;32m   1015\u001B[0m                     \u001B[1;34m\"Complex data not supported\\n{}\\n\"\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mformat\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   1016\u001B[0m                 ) from complex_warning\n",
      "\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\sklearn\\utils\\_array_api.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(array, dtype, order, copy, xp, device)\u001B[0m\n\u001B[0;32m    747\u001B[0m         \u001B[1;31m# Use NumPy API to support order\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    748\u001B[0m         \u001B[1;32mif\u001B[0m \u001B[0mcopy\u001B[0m \u001B[1;32mis\u001B[0m \u001B[1;32mTrue\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    749\u001B[0m             \u001B[0marray\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mnumpy\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0morder\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0morder\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    750\u001B[0m         \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 751\u001B[1;33m             \u001B[0marray\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mnumpy\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0masarray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0morder\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0morder\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m    752\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    753\u001B[0m         \u001B[1;31m# At this point array is a NumPy ndarray. We convert it to an array\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m    754\u001B[0m         \u001B[1;31m# container that is consistent with the input's namespace.\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mC:\\ProgramData\\anaconda3\\envs\\AI\\lib\\site-packages\\pandas\\core\\generic.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(self, dtype, copy)\u001B[0m\n\u001B[0;32m   2149\u001B[0m     def __array__(\n\u001B[0;32m   2150\u001B[0m         \u001B[0mself\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m:\u001B[0m \u001B[0mnpt\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mDTypeLike\u001B[0m \u001B[1;33m|\u001B[0m \u001B[1;32mNone\u001B[0m \u001B[1;33m=\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mcopy\u001B[0m\u001B[1;33m:\u001B[0m \u001B[0mbool_t\u001B[0m \u001B[1;33m|\u001B[0m \u001B[1;32mNone\u001B[0m \u001B[1;33m=\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   2151\u001B[0m     ) -> np.ndarray:\n\u001B[0;32m   2152\u001B[0m         \u001B[0mvalues\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_values\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 2153\u001B[1;33m         \u001B[0marr\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mnp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0masarray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mvalues\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m   2154\u001B[0m         if (\n\u001B[0;32m   2155\u001B[0m             \u001B[0mastype_is_view\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mvalues\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0marr\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   2156\u001B[0m             \u001B[1;32mand\u001B[0m \u001B[0musing_copy_on_write\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;31mValueError\u001B[0m: could not convert string to float: 'ZERO'"
     ]
    }
   ],
   "execution_count": 63
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T14:09:48.970481Z",
     "start_time": "2024-09-18T14:09:48.956483Z"
    }
   },
   "cell_type": "code",
   "source": "print(f1)",
   "id": "ccd06ef8b06c98a1",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8371721052057187\n"
     ]
    }
   ],
   "execution_count": 56
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### Save Scaler and Cols",
   "id": "489313be13e66c00"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T14:18:30.797406Z",
     "start_time": "2024-09-18T14:18:30.769400Z"
    }
   },
   "cell_type": "code",
   "source": [
    "with open(f'model/columns.pkl', 'wb') as f:\n",
    "    pickle.dump(cols.to_list(), f)\n",
    "\n",
    "with open(f'model/scaler.pkl', 'wb') as f:\n",
    "    pickle.dump(sc, f)\n",
    "\n",
    "with open(f'model/model.pkl', 'wb') as f:\n",
    "    pickle.dump(rf, f)"
   ],
   "id": "4b257b89edd730ba",
   "outputs": [],
   "execution_count": 64
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Loop",
   "id": "4bb21a9641b6823"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T13:58:15.066193Z",
     "start_time": "2024-09-18T13:57:49.137112Z"
    }
   },
   "cell_type": "code",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "0.8465739750445631\n",
      "--------------------\n",
      "1\n",
      "0.8587816632306016\n",
      "--------------------\n",
      "2\n",
      "0.8692866312086448\n",
      "--------------------\n",
      "3\n",
      "0.8277746239161031\n",
      "--------------------\n",
      "4\n",
      "0.8769936338171633\n",
      "--------------------\n"
     ]
    }
   ],
   "execution_count": 43,
   "source": [
    "best_preprocess_params =  {'missing_value_per': 0.4, 'variance_threshold': 0.05, 'min_null_per': 0.5}\n",
    "best_model_params = {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 250}\n",
    "best_model_name = 'random_forest'\n",
    "\n",
    "f1_list = []\n",
    "for i in range(5):\n",
    "    print(i)\n",
    "    i+=1\n",
    "    train, test = train_test_split(df, test_size=0.2, shuffle=True)\n",
    "    \n",
    "    preprocessor = Preprocess(\n",
    "        dataframe=train,\n",
    "        missing_value_per=best_preprocess_params['missing_value_per'],\n",
    "        variance_threshold=best_preprocess_params['variance_threshold'],\n",
    "        min_null_per=best_preprocess_params['min_null_per']\n",
    "    )\n",
    "    preprocessor.apply()\n",
    "    preprocessed_train = preprocessor.dataframe\n",
    "    \n",
    "    m = MissingValue(preprocessed_train)\n",
    "    preprocessed_train = m.fill_dataframe()\n",
    "    \n",
    "    test = preprocessor._mapping(test)\n",
    "    m = MissingValue(test)\n",
    "    test = m.fill_dataframe()\n",
    "    \n",
    "    sc = StandardScaler()\n",
    "    x_train = preprocessed_train.drop(['target label / yes no'], axis=1)\n",
    "    cols = x_train.columns\n",
    "    \n",
    "    x_train = sc.fit_transform(x_train)\n",
    "    y_train = preprocessed_train['target label / yes no'].values.astype('int')\n",
    "    \n",
    "    xg = XGBClassifier(\n",
    "        subsample=1.0,\n",
    "        colsample_bytree=0.8,\n",
    "        learning_rate=0.1,\n",
    "        max_depth=3,\n",
    "        n_estimators=100\n",
    "        \n",
    "    )\n",
    "    xg.fit(x_train, y_train)\n",
    "    # rf = RandomForestClassifier(\n",
    "    #     bootstrap=best_model_params['bootstrap'],\n",
    "    #     max_depth=best_model_params['max_depth'],\n",
    "    #     min_samples_split=best_model_params['min_samples_split'],\n",
    "    #     min_samples_leaf=best_model_params['min_samples_leaf'],\n",
    "    #     n_estimators=best_model_params['n_estimators']\n",
    "    # )\n",
    "    # rf.fit(x_train, y_train)\n",
    "    \n",
    "    x_test = test.drop(['target label / yes no'], axis=1)\n",
    "    x_test = x_test[cols]\n",
    "    # f = FeatureEngineering(x_test)\n",
    "    # x_test = f.kmeans(5).values\n",
    "    x_test = sc.transform(x_test)\n",
    "    y_test = test['target label / yes no'].values.astype('int')\n",
    "    \n",
    "    # y_pred = rf.predict(x_test)\n",
    "    y_pred = xg.predict(x_test)\n",
    "    \n",
    "    f1 = f1_score(y_test, y_pred, average='weighted')\n",
    "    print(f1)\n",
    "    print('-'*20)\n",
    "    f1_list.append(f1)"
   ],
   "id": "2541c2502467f414"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T13:58:15.931329Z",
     "start_time": "2024-09-18T13:58:15.915334Z"
    }
   },
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8558821054434151"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 44,
   "source": "sum(f1_list)/len(f1_list)",
   "id": "e999e391b7942a67"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T13:56:24.034247Z",
     "start_time": "2024-09-18T13:56:24.022243Z"
    }
   },
   "cell_type": "code",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0.7916205533596838,\n",
       " 0.843782847316944,\n",
       " 0.8567467859318522,\n",
       " 0.8484060247934703,\n",
       " 0.841208207987869]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 42,
   "source": "f1_list",
   "id": "7a4e0d32bc1dd26f"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T15:41:32.938129Z",
     "start_time": "2024-09-18T15:41:32.906823Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pickle \n",
    "with open(f'model/columns.pkl', 'rb') as f:\n",
    "    cols = pickle.load(f)\n",
    "\n",
    "with open(f'model/scaler.pkl', 'rb') as f:\n",
    "    sc = pickle.load(f)\n",
    "\n",
    "with open(f'model/model.pkl', 'rb') as f:\n",
    "    model = pickle.load(f)"
   ],
   "id": "6b4773d45b76890e",
   "outputs": [],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T15:41:15.600339Z",
     "start_time": "2024-09-18T15:41:09.822041Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pandas as pd \n",
    "df = pd.read_excel('dataset.xlsx',engine='openpyxl')\n",
    "df.drop(['Unnamed: 0', 'visit id'], axis=1, inplace=True)\n"
   ],
   "id": "89de2953628be5e3",
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T15:41:36.959991Z",
     "start_time": "2024-09-18T15:41:36.879965Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from utils import Preprocess\n",
    "pre = Preprocess(df, missing_value_per=0.4, variance_threshold=0.05, min_null_per=0.5)\n",
    "df = pre._mapping(df)\n",
    "cols.append('target label / yes no')\n",
    "df = df[cols]\n",
    "X = df.drop(['target label / yes no'], axis=1)\n",
    "X = sc.transform(X)\n",
    "y = df['target label / yes no'].fillna(0).to_numpy()\n",
    "y_pre = model.predict(X)"
   ],
   "id": "3e387344a200e132",
   "outputs": [],
   "execution_count": 5
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T15:41:39.115344Z",
     "start_time": "2024-09-18T15:41:39.104343Z"
    }
   },
   "cell_type": "code",
   "source": "y",
   "id": "11c53c9d426b25ac",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, ..., 0, 0, 0], dtype=object)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 6
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-18T15:41:54.021876Z",
     "start_time": "2024-09-18T15:41:53.999870Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from sklearn.metrics import f1_score\n",
    "\n",
    "f1_score(y.astype(int), y_pre.astype(int), average='weighted')"
   ],
   "id": "78c1b6bcf860b84f",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9027694811920448"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 7
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}