--- a
+++ b/Classification pCR/.ipynb_checkpoints/FinalPCR-checkpoint.ipynb
@@ -0,0 +1,917 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "3907693e-9962-46d3-b895-73bf3649a737",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from sklearn.metrics import balanced_accuracy_score, classification_report"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "16438710-37d5-49b8-9d71-34d37bf5d70c",
+   "metadata": {},
+   "source": [
+    "### Load the Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "334ad187-b8b3-422e-9270-75d6112ba33b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the training spreadsheet and discard its 'ID' column in one step\n",
+    "df = pd.read_excel(\"TrainDataset2024.xls\").drop(columns='ID')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b2ad4ed8-37ba-48cc-8f71-4d619b366ee9",
+   "metadata": {},
+   "source": [
+    "### Data preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "4533e952-af49-42c2-8d1d-943bc2fbd83f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(308, 120)"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Turn every 999 entry into NaN, then discard each row that has at least one NaN\n",
+    "df = df.replace(999, np.nan).dropna()\n",
+    "\n",
+    "# Rows and columns that remain\n",
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "4dd462f5-d8cf-4bfa-90ee-0ba54f2fb730",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Give the outcome column the shorter name \"PCR\" that the rest of the notebook uses\n",
+    "df = df.rename({\"pCR (outcome)\": \"PCR\"}, axis=\"columns\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "67bb4488-3630-44b7-a051-ac7ace0037a4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 640x480 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Bar chart of how many samples belong to each PCR class.\n",
+    "# value_counts() orders its result by frequency, not by class value, so the\n",
+    "# counts are sorted by class and the tick labels are taken from the index.\n",
+    "# Hard-coded ['0', '1'] labels would be attached to the wrong bars whenever\n",
+    "# class 1 happens to be the more frequent class.\n",
+    "outcomes = df['PCR'].value_counts().sort_index()\n",
+    "outcome_labels = [str(int(label)) for label in outcomes.index]\n",
+    "outcome_values = outcomes.values\n",
+    "plt.bar(outcome_labels, outcome_values)\n",
+    "plt.xlabel('Outcome')\n",
+    "plt.ylabel('Count')\n",
+    "plt.title('Distribution of Outcomes in PCR')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "afc5a1a4-e5ad-4afd-966c-64ad52b46ad7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of outliers: 62\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Absolute z-score of every value, computed column by column\n",
+    "z_scores = df.sub(df.mean()).div(df.std()).abs()\n",
+    "\n",
+    "# A row is flagged when any of its columns lies more than 3 standard\n",
+    "# deviations away from that column's mean\n",
+    "outliers = z_scores.gt(3).any(axis=1)\n",
+    "\n",
+    "# Report how many rows were flagged\n",
+    "print(f\"Number of outliers: {outliers.sum()}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "556d9ea6-739f-4a00-a8ad-cc1a1e16cf5a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only the rows that were not flagged as outliers.\n",
+    "# NOTE(review): the following cells continue from `df`, not from this filtered\n",
+    "# frame, so the outlier removal does not reach the model - confirm this is intended.\n",
+    "df_no_outliers = df.loc[~outliers]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "f656af78-2810-4e60-8c23-1199932871df",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "correlated features:  82\n"
+     ]
+    }
+   ],
+   "source": [
+    "df1 = df.copy()\n",
+    "# checking for correlated features of dataset\n",
+    "def correlation(data, threshold):\n",
+    "    col_corr = {}  # Dictionary to store correlated features\n",
+    "    corr_matrix = data.corr()\n",
+    "    for i in range(len(corr_matrix.columns)):\n",
+    "        for j in range(i):\n",
+    "            if abs(corr_matrix.iloc[i, j]) > threshold:  # We are interested in absolute coefficient value\n",
+    "                colname = corr_matrix.columns[i]\n",
+    "                if colname not in col_corr:\n",
+    "                    col_corr[colname] = set()\n",
+    "                col_corr[colname].add(corr_matrix.columns[j])\n",
+    "\n",
+    "    return col_corr\n",
+    "\n",
+    "# Columns whose absolute correlation with an earlier column is above 0.8\n",
+    "corr_features = correlation(df1, threshold=0.8)\n",
+    "print('correlated features: ', len(corr_features))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "505a4523-9cde-4915-bb9e-7d44833b3103",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(308, 38)"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Drop every column that was flagged as highly correlated with an earlier one\n",
+    "df_corr = df1.drop(columns=list(corr_features))\n",
+    "df_corr.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "1d8f6f55-40a4-4d68-953b-ebb3e8b5164c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Target is the PCR column; the inputs are all remaining columns except the two outcome columns\n",
+    "y_corr = df_corr[\"PCR\"]\n",
+    "X_corr = df_corr.drop(columns=[\"PCR\", \"RelapseFreeSurvival (outcome)\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "812b6f93-e64a-4fed-a1f4-105d02ddd195",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Age</th>\n",
+       "      <th>ER</th>\n",
+       "      <th>PgR</th>\n",
+       "      <th>HER2</th>\n",
+       "      <th>TrippleNegative</th>\n",
+       "      <th>ChemoGrade</th>\n",
+       "      <th>HistologyType</th>\n",
+       "      <th>LNStatus</th>\n",
+       "      <th>TumourStage</th>\n",
+       "      <th>Gene</th>\n",
+       "      <th>...</th>\n",
+       "      <th>original_gldm_DependenceEntropy</th>\n",
+       "      <th>original_gldm_SmallDependenceEmphasis</th>\n",
+       "      <th>original_glrlm_LongRunLowGrayLevelEmphasis</th>\n",
+       "      <th>original_glrlm_ShortRunHighGrayLevelEmphasis</th>\n",
+       "      <th>original_glszm_GrayLevelNonUniformity</th>\n",
+       "      <th>original_glszm_GrayLevelNonUniformityNormalized</th>\n",
+       "      <th>original_glszm_SizeZoneNonUniformityNormalized</th>\n",
+       "      <th>original_glszm_SmallAreaEmphasis</th>\n",
+       "      <th>original_ngtdm_Busyness</th>\n",
+       "      <th>original_ngtdm_Strength</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>41.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>4.529461</td>\n",
+       "      <td>0.005563</td>\n",
+       "      <td>10.779989</td>\n",
+       "      <td>0.789987</td>\n",
+       "      <td>27.545455</td>\n",
+       "      <td>0.834711</td>\n",
+       "      <td>0.180900</td>\n",
+       "      <td>0.403535</td>\n",
+       "      <td>473.464852</td>\n",
+       "      <td>0.000758</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>39.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2.799725</td>\n",
+       "      <td>0.006518</td>\n",
+       "      <td>27.650685</td>\n",
+       "      <td>0.442279</td>\n",
+       "      <td>78.025000</td>\n",
+       "      <td>0.975313</td>\n",
+       "      <td>0.198125</td>\n",
+       "      <td>0.444391</td>\n",
+       "      <td>59.459710</td>\n",
+       "      <td>0.003685</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>31.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2.863091</td>\n",
+       "      <td>0.007181</td>\n",
+       "      <td>25.338218</td>\n",
+       "      <td>0.503046</td>\n",
+       "      <td>72.027027</td>\n",
+       "      <td>0.973338</td>\n",
+       "      <td>0.275749</td>\n",
+       "      <td>0.534549</td>\n",
+       "      <td>33.935384</td>\n",
+       "      <td>0.006447</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>35.0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2.756432</td>\n",
+       "      <td>0.004902</td>\n",
+       "      <td>31.461354</td>\n",
+       "      <td>0.399896</td>\n",
+       "      <td>99.019802</td>\n",
+       "      <td>0.980394</td>\n",
+       "      <td>0.253014</td>\n",
+       "      <td>0.506185</td>\n",
+       "      <td>46.859265</td>\n",
+       "      <td>0.004543</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>61.0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>2.676170</td>\n",
+       "      <td>0.007222</td>\n",
+       "      <td>27.916261</td>\n",
+       "      <td>0.473278</td>\n",
+       "      <td>56.034483</td>\n",
+       "      <td>0.966112</td>\n",
+       "      <td>0.216409</td>\n",
+       "      <td>0.462282</td>\n",
+       "      <td>39.621023</td>\n",
+       "      <td>0.005626</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 36 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    Age  ER  PgR  HER2  TrippleNegative  ChemoGrade  HistologyType  LNStatus  \\\n",
+       "0  41.0   0  0.0   0.0              1.0         3.0            1.0       1.0   \n",
+       "1  39.0   1  1.0   0.0              0.0         3.0            1.0       1.0   \n",
+       "2  31.0   0  0.0   0.0              1.0         2.0            1.0       0.0   \n",
+       "3  35.0   0  0.0   0.0              1.0         3.0            1.0       1.0   \n",
+       "4  61.0   1  0.0   0.0              0.0         2.0            1.0       0.0   \n",
+       "\n",
+       "   TumourStage  Gene  ...  original_gldm_DependenceEntropy  \\\n",
+       "0            2   1.0  ...                         4.529461   \n",
+       "1            2   0.0  ...                         2.799725   \n",
+       "2            2   1.0  ...                         2.863091   \n",
+       "3            3   1.0  ...                         2.756432   \n",
+       "4            2   1.0  ...                         2.676170   \n",
+       "\n",
+       "   original_gldm_SmallDependenceEmphasis  \\\n",
+       "0                               0.005563   \n",
+       "1                               0.006518   \n",
+       "2                               0.007181   \n",
+       "3                               0.004902   \n",
+       "4                               0.007222   \n",
+       "\n",
+       "   original_glrlm_LongRunLowGrayLevelEmphasis  \\\n",
+       "0                                   10.779989   \n",
+       "1                                   27.650685   \n",
+       "2                                   25.338218   \n",
+       "3                                   31.461354   \n",
+       "4                                   27.916261   \n",
+       "\n",
+       "   original_glrlm_ShortRunHighGrayLevelEmphasis  \\\n",
+       "0                                      0.789987   \n",
+       "1                                      0.442279   \n",
+       "2                                      0.503046   \n",
+       "3                                      0.399896   \n",
+       "4                                      0.473278   \n",
+       "\n",
+       "   original_glszm_GrayLevelNonUniformity  \\\n",
+       "0                              27.545455   \n",
+       "1                              78.025000   \n",
+       "2                              72.027027   \n",
+       "3                              99.019802   \n",
+       "4                              56.034483   \n",
+       "\n",
+       "   original_glszm_GrayLevelNonUniformityNormalized  \\\n",
+       "0                                         0.834711   \n",
+       "1                                         0.975313   \n",
+       "2                                         0.973338   \n",
+       "3                                         0.980394   \n",
+       "4                                         0.966112   \n",
+       "\n",
+       "   original_glszm_SizeZoneNonUniformityNormalized  \\\n",
+       "0                                        0.180900   \n",
+       "1                                        0.198125   \n",
+       "2                                        0.275749   \n",
+       "3                                        0.253014   \n",
+       "4                                        0.216409   \n",
+       "\n",
+       "   original_glszm_SmallAreaEmphasis  original_ngtdm_Busyness  \\\n",
+       "0                          0.403535               473.464852   \n",
+       "1                          0.444391                59.459710   \n",
+       "2                          0.534549                33.935384   \n",
+       "3                          0.506185                46.859265   \n",
+       "4                          0.462282                39.621023   \n",
+       "\n",
+       "   original_ngtdm_Strength  \n",
+       "0                 0.000758  \n",
+       "1                 0.003685  \n",
+       "2                 0.006447  \n",
+       "3                 0.004543  \n",
+       "4                 0.005626  \n",
+       "\n",
+       "[5 rows x 36 columns]"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Preview the first five rows of the selected features\n",
+    "X_corr.head(n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "513bda72-85cc-4dbe-9d75-5c3ccf1f7835",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the selected feature names to a text file, one name per line, so the\n",
+    "# prediction step can reload exactly the same columns\n",
+    "with open('30cor.txt', 'w') as f:\n",
+    "    f.writelines(feature + '\\n' for feature in X_corr.columns)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "2ce3c60a-87ec-4145-81b7-c0dd29a211d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "# Split the correlation-filtered features into training and testing sets\n",
+    "X_train1, X_test1, y_train1, y_test1 = train_test_split(X_corr, y_corr, test_size=0.2,shuffle=True,random_state=42)\n",
+    "# Standardize the features: learn the mean/std from the training split only\n",
+    "# and apply that same transformation to the test split. Calling fit_transform\n",
+    "# on the test split would re-fit the scaler on the test data, so the two\n",
+    "# splits would be scaled with different statistics.\n",
+    "scaler = StandardScaler()\n",
+    "X_train_sc1 = scaler.fit_transform(X_train1)\n",
+    "X_test_sc1 = scaler.transform(X_test1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "8982309b-a9b3-44ff-9267-01ec13cd0431",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "((246, 36),)"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Show the shapes of both splits (the dangling comma previously displayed a one-element tuple)\n",
+    "X_train1.shape, X_test1.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "1a06c806-469c-41d1-9160-3cf9028a4cf5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Target and inputs taken from the correlation-filtered frame, stored under\n",
+    "# separate names for forward feature selection\n",
+    "y_for = df_corr[\"PCR\"]\n",
+    "X_for = df_corr.drop(columns=[\"PCR\", \"RelapseFreeSurvival (outcome)\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "42c12a4b-0e06-45dd-b178-8108db5b7125",
+   "metadata": {},
+   "source": [
+    "### Random forest Final Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "aa1edb33-ce99-4800-8b37-d0dd3d209a2f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import joblib\n",
+    "import matplotlib.pyplot as plt\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.model_selection import train_test_split, learning_curve\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.metrics import classification_report, balanced_accuracy_score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "70de1f75-c04c-40b4-b884-1aaf6f06cec0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Initial Classification Report:\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "         0.0       0.80      1.00      0.89        74\n",
+      "         1.0       0.00      0.00      0.00        19\n",
+      "\n",
+      "    accuracy                           0.80        93\n",
+      "   macro avg       0.40      0.50      0.44        93\n",
+      "weighted avg       0.63      0.80      0.71        93\n",
+      "\n",
+      "\n",
+      "Initial Balanced Accuracy Score:\n",
+      "0.5\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
+      "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
+      "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
+      "  _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Hold out 30% of the samples for testing\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X_corr, y_corr, test_size=0.3, shuffle=True, random_state=64\n",
+    ")\n",
+    "\n",
+    "# Fit the scaler on the training split, then apply it to both splits\n",
+    "scaler = StandardScaler().fit(X_train)\n",
+    "X_train_norm = scaler.transform(X_train)\n",
+    "X_test_norm = scaler.transform(X_test)\n",
+    "\n",
+    "# Baseline Random Forest, trained on the training data as-is (no SMOTE)\n",
+    "rf_classifier = RandomForestClassifier(\n",
+    "    n_estimators=25, max_depth=5, min_samples_split=15, min_samples_leaf=3, random_state=42\n",
+    ").fit(X_train_norm, y_train)\n",
+    "\n",
+    "# Baseline predictions for the held-out samples\n",
+    "y_pred = rf_classifier.predict(X_test_norm)\n",
+    "\n",
+    "# Baseline metrics on the held-out samples\n",
+    "balanced_accuracy = balanced_accuracy_score(y_test, y_pred)\n",
+    "report = classification_report(y_test, y_pred)\n",
+    "\n",
+    "print('Initial Classification Report:')\n",
+    "print(report)\n",
+    "\n",
+    "print('\\nInitial Balanced Accuracy Score:')\n",
+    "print(balanced_accuracy)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "id": "4b3632e2-2072-46a3-ad66-076cfb235094",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Classification Report After SMOTE:\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "         0.0       0.87      0.88      0.87        74\n",
+      "         1.0       0.50      0.47      0.49        19\n",
+      "\n",
+      "    accuracy                           0.80        93\n",
+      "   macro avg       0.68      0.68      0.68        93\n",
+      "weighted avg       0.79      0.80      0.79        93\n",
+      "\n",
+      "\n",
+      "Balanced Accuracy Score After SMOTE:\n",
+      "0.6760312944523471\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Oversample the minority class with SMOTE; only the training data is resampled\n",
+    "smote = SMOTE(random_state=42)\n",
+    "X_train_smote, y_train_smote = smote.fit_resample(X_train_norm, y_train)\n",
+    "\n",
+    "# Random Forest fitted on the SMOTE-resampled training data\n",
+    "rf_classifier_smote = RandomForestClassifier(\n",
+    "    n_estimators=25, max_depth=5, min_samples_split=15, min_samples_leaf=3, random_state=42\n",
+    ").fit(X_train_smote, y_train_smote)\n",
+    "\n",
+    "# Predictions for the held-out test samples, which were not resampled\n",
+    "y_pred_smote = rf_classifier_smote.predict(X_test_norm)\n",
+    "\n",
+    "# Metrics of the SMOTE-trained model on the held-out samples\n",
+    "balanced_accuracy_smote = balanced_accuracy_score(y_test, y_pred_smote)\n",
+    "report_smote = classification_report(y_test, y_pred_smote)\n",
+    "\n",
+    "print('\\nClassification Report After SMOTE:')\n",
+    "print(report_smote)\n",
+    "\n",
+    "print('\\nBalanced Accuracy Score After SMOTE:')\n",
+    "print(balanced_accuracy_smote)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "2338844d-31b3-4d07-ac64-fbc4c58a8a2c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy for each fold:\n",
+      "Fold 0: 0.8000\n",
+      "Fold 1: 0.8000\n",
+      "Fold 2: 0.8000\n",
+      "Fold 3: 0.9200\n",
+      "Fold 4: 0.8000\n",
+      "Fold 5: 0.8000\n",
+      "Fold 6: 0.8750\n",
+      "Fold 7: 0.7917\n",
+      "Fold 8: 0.8750\n",
+      "Fold 9: 0.8750\n",
+      "Average accuracy: 0.8337\n"
+     ]
+    }
+   ],
+   "source": [
+    "# 10-fold cross-validation of a larger Random Forest on a fresh 80/20 split.\n",
+    "# The split and the model get their own *_cv names: reusing X_train / X_test /\n",
+    "# y_train / y_test / rf_classifier here would overwrite the objects that\n",
+    "# `scaler`, X_train_norm and X_test_norm were derived from in the earlier cell,\n",
+    "# leaving the notebook state inconsistent.\n",
+    "from sklearn.model_selection import train_test_split, KFold, cross_val_score\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "# Split the data\n",
+    "X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(X_corr, y_corr, test_size=0.2, shuffle=True, random_state=60)\n",
+    "\n",
+    "# Initialize the model\n",
+    "rf_classifier_cv = RandomForestClassifier(max_depth=15, min_samples_leaf=2, min_samples_split=15, n_estimators=1000, random_state=42)\n",
+    "\n",
+    "# Perform 10-fold cross-validation on the training portion only\n",
+    "k = KFold(n_splits=10)\n",
+    "scores = cross_val_score(rf_classifier_cv, X_train_cv, y_train_cv, cv=k)\n",
+    "\n",
+    "# No `scoring` argument is passed, so the classifier's default score (accuracy) is reported\n",
+    "print(\"Accuracy for each fold:\")\n",
+    "for fold, score in enumerate(scores):\n",
+    "    print(f\"Fold {fold}: {score:.4f}\")\n",
+    "\n",
+    "# Printing the average accuracy\n",
+    "average_accuracy = np.mean(scores)\n",
+    "print(f\"Average accuracy: {average_accuracy:.4f}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "id": "306361af-a43c-4bc0-b205-6145f4c8cf8c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 1000x600 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Learning curve of the SMOTE-trained Random Forest: 5-fold CV at ten training-set fractions.\n",
+    "# NOTE(review): the curve is computed on the already-resampled data, so the\n",
+    "# validation folds can contain synthetic samples - confirm this is intended.\n",
+    "train_sizes, train_scores, test_scores = learning_curve(\n",
+    "    rf_classifier_smote,\n",
+    "    X_train_smote,\n",
+    "    y_train_smote,\n",
+    "    train_sizes=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],\n",
+    "    cv=5,\n",
+    "    n_jobs=-1,\n",
+    ")\n",
+    "\n",
+    "# Mean and standard deviation across the folds for every training-set size\n",
+    "train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)\n",
+    "test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)\n",
+    "\n",
+    "# Mean score curves\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "plt.plot(train_sizes, train_mean, color='blue', label='Training score')\n",
+    "plt.plot(train_sizes, test_mean, color='green', label='Cross-validation score')\n",
+    "\n",
+    "# Shaded bands of +/- one standard deviation around each mean curve\n",
+    "plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.2, color='blue')\n",
+    "plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.2, color='green')\n",
+    "\n",
+    "plt.title('Learning Curve for Random Forest Classifier with SMOTE')\n",
+    "plt.xlabel('Training Set Size')\n",
+    "plt.ylabel('Score')\n",
+    "plt.legend()\n",
+    "plt.grid(True)\n",
+    "plt.show()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "57e9f657-5172-408e-ac03-80fc36f37031",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['scaler.pkl']"
+      ]
+     },
+     "execution_count": 56,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Save the Random Forest model and scaler using joblib\n",
+    "joblib.dump(rf_classifier_smote, 'rf_classifier_smote.pkl')  # Save the trained model\n",
+    "joblib.dump(scaler, 'scaler.pkl')  # Save the scaler used for standardizing the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "bb709a33-09a0-49cd-ab95-72410f72e254",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Predictions on Test Dataset: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.\n",
+      " 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
+      " 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0.\n",
+      " 1. 1. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0.\n",
+      " 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0.\n",
+      " 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.]\n",
+      "Number of 1s in predictions: 41\n",
+      "Number of 0s in predictions: 92\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ---- Loading the model and scaler from the pickle file ----\n",
+    "# NOTE: joblib.load unpickles the files, which can execute arbitrary code;\n",
+    "# only load artifacts that were produced by this notebook.\n",
+    "loaded_rf_model = joblib.load('rf_classifier_smote.pkl')\n",
+    "loaded_scaler = joblib.load('scaler.pkl')\n",
+    "\n",
+    "# Read the final test data\n",
+    "df_test = pd.read_excel(\"FinalTestDataset2024.xls\")\n",
+    "\n",
+    "# Apply the same feature selection (blank lines in the feature list are skipped)\n",
+    "with open('30cor.txt', 'r') as file:\n",
+    "    features = [line.strip() for line in file if line.strip()]\n",
+    "\n",
+    "# The training data treats 999 as a missing value and drops the affected rows.\n",
+    "# Test rows cannot be dropped because every ID needs a prediction, so 999 is\n",
+    "# converted to NaN and each gap is filled with the training mean of that feature\n",
+    "# (stored in the scaler). Without this step the placeholder 999 would be scaled\n",
+    "# and passed to the model as if it were a real measurement.\n",
+    "training_means = pd.Series(loaded_scaler.mean_, index=features)\n",
+    "df_test_selected = df_test[features].replace(999, np.nan).fillna(training_means)\n",
+    "\n",
+    "# Transform the test data using the loaded scaler\n",
+    "X_test_sc = loaded_scaler.transform(df_test_selected)\n",
+    "\n",
+    "# Make predictions using the loaded Random Forest model\n",
+    "y_pred_loaded = loaded_rf_model.predict(X_test_sc)\n",
+    "\n",
+    "# Count occurrences of predicted classes\n",
+    "count_1 = np.sum(y_pred_loaded == 1)\n",
+    "count_0 = np.sum(y_pred_loaded == 0)\n",
+    "\n",
+    "# Print predictions and counts\n",
+    "print(\"Predictions on Test Dataset:\", y_pred_loaded)\n",
+    "print(f\"Number of 1s in predictions: {count_1}\")\n",
+    "print(f\"Number of 0s in predictions: {count_0}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "d952d9a5-d5d6-4e68-827e-c9f9915a306c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Predictions have been saved to finalPCRoutput.xlsx\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# The output is keyed by ID, so stop early if the test data has no such column\n",
+    "if 'ID' not in df_test:\n",
+    "    raise ValueError(\"The input test dataset must contain an 'ID' column.\")\n",
+    "\n",
+    "# Pair every test ID with its predicted class\n",
+    "output_df = df_test[['ID']].assign(Prediction=y_pred_loaded)\n",
+    "\n",
+    "# Write the table to an .xlsx workbook without the index column\n",
+    "output_file_name = 'finalPCRoutput.xlsx'\n",
+    "output_df.to_excel(output_file_name, index=False)\n",
+    "\n",
+    "print(f\"Predictions have been saved to {output_file_name}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5cc9187-dc3a-4778-a09c-a52157133309",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}