--- a +++ b/Classification pCR/Classification_Dev.ipynb @@ -0,0 +1,2083 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b348fcda-1cd7-424f-aafb-f0fb5f4fe94d", + "metadata": {}, + "source": [ + "### Loading dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "564c5a25-bc61-4d3d-acca-d54d998bdb4a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>ID</th>\n", + " <th>pCR (outcome)</th>\n", + " <th>RelapseFreeSurvival (outcome)</th>\n", + " <th>Age</th>\n", + " <th>ER</th>\n", + " <th>PgR</th>\n", + " <th>HER2</th>\n", + " <th>TrippleNegative</th>\n", + " <th>ChemoGrade</th>\n", + " <th>Proliferation</th>\n", + " <th>...</th>\n", + " <th>original_glszm_SmallAreaHighGrayLevelEmphasis</th>\n", + " <th>original_glszm_SmallAreaLowGrayLevelEmphasis</th>\n", + " <th>original_glszm_ZoneEntropy</th>\n", + " <th>original_glszm_ZonePercentage</th>\n", + " <th>original_glszm_ZoneVariance</th>\n", + " <th>original_ngtdm_Busyness</th>\n", + " <th>original_ngtdm_Coarseness</th>\n", + " <th>original_ngtdm_Complexity</th>\n", + " <th>original_ngtdm_Contrast</th>\n", + " <th>original_ngtdm_Strength</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>TRG002174</td>\n", + " <td>1</td>\n", + " <td>144.0</td>\n", + " <td>41.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " <td>...</td>\n", + " <td>0.517172</td>\n", + " <td>0.375126</td>\n", + " <td>3.325332</td>\n", + " <td>0.002314</td>\n", + " <td>3880771.500</td>\n", + " <td>473.464852</td>\n", + " <td>0.000768</td>\n", + " <td>0.182615</td>\n", + " <td>0.030508</td>\n", + " <td>0.000758</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>TRG002178</td>\n", + " <td>0</td>\n", + " <td>142.0</td>\n", + " <td>39.0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " <td>...</td>\n", + " <td>0.444391</td>\n", + " <td>0.444391</td>\n", + " <td>3.032144</td>\n", + " <td>0.005612</td>\n", + " <td>2372009.744</td>\n", + " <td>59.459710</td>\n", + " <td>0.004383</td>\n", + " <td>0.032012</td>\n", + " <td>0.001006</td>\n", + " <td>0.003685</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>TRG002204</td>\n", + " <td>1</td>\n", + " <td>135.0</td>\n", + " <td>31.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>...</td>\n", + " <td>0.534549</td>\n", + " <td>0.534549</td>\n", + " <td>2.485848</td>\n", + " <td>0.006752</td>\n", + " <td>1540027.421</td>\n", + " <td>33.935384</td>\n", + " <td>0.007584</td>\n", + " <td>0.024062</td>\n", + " <td>0.000529</td>\n", + " <td>0.006447</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>TRG002206</td>\n", + " <td>0</td>\n", + " <td>12.0</td>\n", + " <td>35.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " <td>...</td>\n", + " <td>0.506185</td>\n", + " <td>0.506185</td>\n", + " <td>2.606255</td>\n", + " <td>0.003755</td>\n", + " <td>6936740.794</td>\n", + " <td>46.859265</td>\n", + " <td>0.005424</td>\n", + " <td>0.013707</td>\n", + " <td>0.000178</td>\n", + " <td>0.004543</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>TRG002210</td>\n", + " <td>0</td>\n", + " <td>109.0</td>\n", + " <td>61.0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>...</td>\n", + " <td>0.462282</td>\n", + " <td>0.462282</td>\n", + " <td>2.809279</td>\n", + " <td>0.006521</td>\n", + " <td>1265399.054</td>\n", + " <td>39.621023</td>\n", + " <td>0.006585</td>\n", + " <td>0.034148</td>\n", + " <td>0.001083</td>\n", + " <td>0.005626</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 121 columns</p>\n", + "</div>" + ], + "text/plain": [ + " ID pCR (outcome) RelapseFreeSurvival (outcome) Age ER PgR \\\n", + "0 TRG002174 1 144.0 41.0 0 0 \n", + "1 TRG002178 0 142.0 39.0 1 1 \n", + "2 TRG002204 1 135.0 31.0 0 0 \n", + "3 TRG002206 0 12.0 35.0 0 0 \n", + "4 TRG002210 0 109.0 61.0 1 0 \n", + "\n", + " HER2 TrippleNegative ChemoGrade Proliferation ... \\\n", + "0 0 1 3 3 ... \n", + "1 0 0 3 3 ... \n", + "2 0 1 2 1 ... \n", + "3 0 1 3 3 ... \n", + "4 0 0 2 1 ... \n", + "\n", + " original_glszm_SmallAreaHighGrayLevelEmphasis \\\n", + "0 0.517172 \n", + "1 0.444391 \n", + "2 0.534549 \n", + "3 0.506185 \n", + "4 0.462282 \n", + "\n", + " original_glszm_SmallAreaLowGrayLevelEmphasis original_glszm_ZoneEntropy \\\n", + "0 0.375126 3.325332 \n", + "1 0.444391 3.032144 \n", + "2 0.534549 2.485848 \n", + "3 0.506185 2.606255 \n", + "4 0.462282 2.809279 \n", + "\n", + " original_glszm_ZonePercentage original_glszm_ZoneVariance \\\n", + "0 0.002314 3880771.500 \n", + "1 0.005612 2372009.744 \n", + "2 0.006752 1540027.421 \n", + "3 0.003755 6936740.794 \n", + "4 0.006521 1265399.054 \n", + "\n", + " original_ngtdm_Busyness original_ngtdm_Coarseness \\\n", + "0 473.464852 0.000768 \n", + "1 59.459710 0.004383 \n", + "2 33.935384 0.007584 \n", + "3 46.859265 0.005424 \n", + "4 39.621023 0.006585 \n", + "\n", + " original_ngtdm_Complexity original_ngtdm_Contrast original_ngtdm_Strength \n", + "0 0.182615 0.030508 0.000758 \n", + "1 0.032012 0.001006 0.003685 \n", + "2 0.024062 0.000529 0.006447 \n", + "3 0.013707 0.000178 0.004543 \n", + "4 0.034148 0.001083 0.005626 \n", + "\n", + "[5 rows x 121 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df= pd.read_excel(\"TrainDataset2024.xls\")\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6cecd481-9f9e-45a6-9ece-6b43a71b2fb1", + "metadata": {}, + "source": [ + "## Data Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "52341c70-e6d4-4738-8817-eda5398912d8", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.rename(columns={\"pCR (outcome)\": \"PCR\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8457af35-f78b-4595-b162-77fa6f62621d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of null values: 105\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "df.replace(999, np.nan, inplace= True)\n", + "null = df.isna().sum().sum()\n", + "print(\"Number of null values: \",null)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cf17629c-72ca-4fb7-a094-8c26cfd34dfa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Features with Missing Values ===\n", + "PCR 5\n", + "PgR 1\n", + "HER2 1\n", + "TrippleNegative 1\n", + "ChemoGrade 3\n", + "Proliferation 2\n", + "HistologyType 3\n", + "LNStatus 1\n", + "Gene 88\n", + "dtype: int64\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 1200x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Overall Missing Data Summary ===\n", + "Total number of missing values: 105\n", + "Total number of cells in dataset: 48400\n", + "Percentage of missing data: 0.22%\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Replace 999 with NaN for further analysis\n", + "df.replace(999, np.nan, inplace=True)\n", + "\n", + "# Count missing (999) values in each column\n", + "missing_counts = (df.isna().sum())\n", + "\n", + "# Filter out columns with no missing values\n", + "missing_counts = missing_counts[missing_counts > 0]\n", + "\n", + "# Display feature names and missing value counts\n", + "print(\"\\n=== Features with Missing Values ===\")\n", + "print(missing_counts)\n", + "\n", + "# Visualize the distribution of missing values\n", + "plt.figure(figsize=(12, 6))\n", + "missing_counts.plot(kind='bar', color='skyblue', edgecolor='black')\n", + "plt.title('Number of Missing Values by Feature')\n", + "plt.xlabel('Features')\n", + "plt.ylabel('Count of Missing Values')\n", + "plt.xticks(rotation=45, ha='right')\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Summary statistics\n", + "total_missing = missing_counts.sum()\n", + "total_cells = df.size\n", + "missing_percentage = (total_missing / total_cells) * 100\n", + "\n", + "print(\"\\n=== Overall Missing Data Summary ===\")\n", + "print(f\"Total number of missing values: {total_missing}\")\n", + "print(f\"Total number of cells in dataset: {total_cells}\")\n", + "print(f\"Percentage of missing data: {missing_percentage:.2f}%\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "68cedf35-6386-426c-ac64-1127dd4c9935", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PCR 0\n", + "PgR 0\n", + "HER2 0\n", + "TrippleNegative 0\n", + "ChemoGrade 0\n", + "Proliferation 0\n", + "HistologyType 0\n", + "LNStatus 0\n", + "Gene 0\n", + "dtype: int64\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\LLR User\\AppData\\Local\\Temp\\ipykernel_14388\\4226577187.py:12: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " df[feature].fillna(mode_value, inplace=True) # Replace NaN with the mode\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# List of features you want to modify\n", + "features_to_modify = [\n", + " \"PCR\", \"PgR\", \"HER2\", \"TrippleNegative\", \"ChemoGrade\", \n", + " \"Proliferation\", \"HistologyType\", \"LNStatus\", \"Gene\"\n", + "]\n", + "\n", + "# Loop through the columns and replace NaN values with the mode of each column\n", + "for feature in features_to_modify:\n", + " mode_value = df[feature].mode()[0] # Get the mode of the feature\n", + " df[feature].fillna(mode_value, inplace=True) # Replace NaN with the mode\n", + "\n", + "# Check the changes\n", + "print(df[features_to_modify].isna().sum()) # Check how many NaN values are left\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "995b86b2-4475-4b94-8a7e-044bebaf8556", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.drop('ID', axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2f8b60cc-c014-4424-ba37-d7705812d8bc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of outliers: 87\n", + "Number of outliers: PCR 65.000000\n", + "RelapseFreeSurvival (outcome) 17185.083333\n", + "Age 16080.576595\n", + "ER 166.000000\n", + "PgR 126.000000\n", + " ... \n", + "original_ngtdm_Busyness 36656.435955\n", + "original_ngtdm_Coarseness 4.608712\n", + "original_ngtdm_Complexity 18.424557\n", + "original_ngtdm_Contrast 1.797244\n", + "original_ngtdm_Strength 4.089319\n", + "Length: 120, dtype: float64\n" + ] + } + ], + "source": [ + "# Calculate Z-scores for the entire dataset\n", + "z_scores = np.abs((df - df.mean()) / df.std())\n", + "\n", + "# Identify rows that have z-scores above the threshold (3 in this case)\n", + "outliers = (z_scores > 3).any(axis=1)\n", + "\n", + "# Print the number of outliers\n", + "print(f\"Number of outliers: {outliers.sum()}\")\n", + "\n", + "# Remove outliers from the dataset\n", + "df_no_outliers = df[~outliers]\n", + "\n", + "print(f\"Number of outliers: {df_no_outliers.sum()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "37f55653-9445-404e-8730-c1c93c45e00f", + "metadata": {}, + "source": [ + "### Feature Selection" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "70899e82-33c6-495d-baaf-e8d6d5b4afa1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "correlated features: 82\n" + ] + } + ], + "source": [ + "df1 = df.copy()\n", + "# checking for correlated features of dataset\n", + "def correlation(data, threshold):\n", + " col_corr = {} # Dictionary to store correlated features\n", + " corr_matrix = data.corr()\n", + " for i in range(len(corr_matrix.columns)):\n", + " for j in range(i):\n", + " if abs(corr_matrix.iloc[i, j]) > threshold: # We are interested in absolute coefficient value\n", + " colname = corr_matrix.columns[i]\n", + " if colname not in col_corr:\n", + " col_corr[colname] = set()\n", + " col_corr[colname].add(corr_matrix.columns[j])\n", + "\n", + " return col_corr\n", + "\n", + "corr_features = correlation(df1, 0.8)\n", + "print('correlated features: ', len(corr_features))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e8695d75-11cd-4f48-b301-d019ce147584", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(400, 38)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# removing the correlated features\n", + "df_corr= df1.drop(labels=corr_features, axis=1)\n", + "df_corr.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "164fcddb-b51c-41b7-b23a-a4f6c7725c84", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# checking the count of target values after data processing and feature selection\n", + "outcomes = df_corr['PCR'].value_counts()\n", + "outcome_labels =['0','1']\n", + "outcome_values = outcomes.values\n", + "plt.bar(outcome_labels, outcome_values)\n", + "plt.xlabel('Outcome')\n", + "plt.ylabel('Count')\n", + "plt.title('Distribution of Outcomes in PCR')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8f4b2fc4-664b-4c47-af59-e5646acb9aed", + "metadata": {}, + "outputs": [], + "source": [ + "# dataset using correlated feature selection\n", + "X_corr = df_corr.drop([\"PCR\",\"RelapseFreeSurvival (outcome)\"],axis=1)\n", + "y_corr = df_corr[\"PCR\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "683c242f-c3ba-4268-8318-eb01d4f1d9aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Age</th>\n", + " <th>ER</th>\n", + " <th>PgR</th>\n", + " <th>HER2</th>\n", + " <th>TrippleNegative</th>\n", + " <th>ChemoGrade</th>\n", + " <th>HistologyType</th>\n", + " <th>LNStatus</th>\n", + " <th>TumourStage</th>\n", + " <th>Gene</th>\n", + " <th>...</th>\n", + " <th>original_gldm_SmallDependenceEmphasis</th>\n", + " <th>original_glrlm_LongRunLowGrayLevelEmphasis</th>\n", + " <th>original_glrlm_ShortRunHighGrayLevelEmphasis</th>\n", + " <th>original_glszm_GrayLevelNonUniformity</th>\n", + " <th>original_glszm_GrayLevelNonUniformityNormalized</th>\n", + " <th>original_glszm_LargeAreaEmphasis</th>\n", + " <th>original_glszm_SizeZoneNonUniformityNormalized</th>\n", + " <th>original_glszm_SmallAreaEmphasis</th>\n", + " <th>original_ngtdm_Busyness</th>\n", + " <th>original_ngtdm_Strength</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>41.0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.0</td>\n", + " <td>3.0</td>\n", + " <td>1.0</td>\n", + " <td>1.0</td>\n", + " <td>2</td>\n", + " <td>1.0</td>\n", + " <td>...</td>\n", + " <td>0.005563</td>\n", + " <td>10.779989</td>\n", + " <td>0.789987</td>\n", + " <td>27.545455</td>\n", + " <td>0.834711</td>\n", + " <td>4067578.818</td>\n", + " <td>0.180900</td>\n", + " <td>0.403535</td>\n", + " <td>473.464852</td>\n", + " <td>0.000758</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>39.0</td>\n", + " <td>1</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>3.0</td>\n", + " <td>1.0</td>\n", + " <td>1.0</td>\n", + " <td>2</td>\n", + " <td>0.0</td>\n", + " <td>...</td>\n", + " <td>0.006518</td>\n", + " <td>27.650685</td>\n", + " <td>0.442279</td>\n", + " <td>78.025000</td>\n", + " <td>0.975313</td>\n", + " <td>2403756.075</td>\n", + " <td>0.198125</td>\n", + " <td>0.444391</td>\n", + " <td>59.459710</td>\n", + " <td>0.003685</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>31.0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.0</td>\n", + " <td>2.0</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>2</td>\n", + " <td>1.0</td>\n", + " <td>...</td>\n", + " <td>0.007181</td>\n", + " <td>25.338218</td>\n", + " <td>0.503046</td>\n", + " <td>72.027027</td>\n", + " <td>0.973338</td>\n", + " <td>1561963.432</td>\n", + " <td>0.275749</td>\n", + " <td>0.534549</td>\n", + " <td>33.935384</td>\n", + " <td>0.006447</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>35.0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.0</td>\n", + " <td>3.0</td>\n", + " <td>1.0</td>\n", + " <td>1.0</td>\n", + " <td>3</td>\n", + " <td>1.0</td>\n", + " <td>...</td>\n", + " <td>0.004902</td>\n", + " <td>31.461354</td>\n", + " <td>0.399896</td>\n", + " <td>99.019802</td>\n", + " <td>0.980394</td>\n", + " <td>7007670.723</td>\n", + " <td>0.253014</td>\n", + " <td>0.506185</td>\n", + " <td>46.859265</td>\n", + " <td>0.004543</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>61.0</td>\n", + " <td>1</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>2.0</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>2</td>\n", + " <td>1.0</td>\n", + " <td>...</td>\n", + " <td>0.007222</td>\n", + " <td>27.916261</td>\n", + " <td>0.473278</td>\n", + " <td>56.034483</td>\n", + " <td>0.966112</td>\n", + " <td>1288913.690</td>\n", + " <td>0.216409</td>\n", + " <td>0.462282</td>\n", + " <td>39.621023</td>\n", + " <td>0.005626</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 36 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Age ER PgR HER2 TrippleNegative ChemoGrade HistologyType LNStatus \\\n", + "0 41.0 0 0.0 0.0 1.0 3.0 1.0 1.0 \n", + "1 39.0 1 1.0 0.0 0.0 3.0 1.0 1.0 \n", + "2 31.0 0 0.0 0.0 1.0 2.0 1.0 0.0 \n", + "3 35.0 0 0.0 0.0 1.0 3.0 1.0 1.0 \n", + "4 61.0 1 0.0 0.0 0.0 2.0 1.0 0.0 \n", + "\n", + " TumourStage Gene ... original_gldm_SmallDependenceEmphasis \\\n", + "0 2 1.0 ... 0.005563 \n", + "1 2 0.0 ... 0.006518 \n", + "2 2 1.0 ... 0.007181 \n", + "3 3 1.0 ... 0.004902 \n", + "4 2 1.0 ... 0.007222 \n", + "\n", + " original_glrlm_LongRunLowGrayLevelEmphasis \\\n", + "0 10.779989 \n", + "1 27.650685 \n", + "2 25.338218 \n", + "3 31.461354 \n", + "4 27.916261 \n", + "\n", + " original_glrlm_ShortRunHighGrayLevelEmphasis \\\n", + "0 0.789987 \n", + "1 0.442279 \n", + "2 0.503046 \n", + "3 0.399896 \n", + "4 0.473278 \n", + "\n", + " original_glszm_GrayLevelNonUniformity \\\n", + "0 27.545455 \n", + "1 78.025000 \n", + "2 72.027027 \n", + "3 99.019802 \n", + "4 56.034483 \n", + "\n", + " original_glszm_GrayLevelNonUniformityNormalized \\\n", + "0 0.834711 \n", + "1 0.975313 \n", + "2 0.973338 \n", + "3 0.980394 \n", + "4 0.966112 \n", + "\n", + " original_glszm_LargeAreaEmphasis \\\n", + "0 4067578.818 \n", + "1 2403756.075 \n", + "2 1561963.432 \n", + "3 7007670.723 \n", + "4 1288913.690 \n", + "\n", + " original_glszm_SizeZoneNonUniformityNormalized \\\n", + "0 0.180900 \n", + "1 0.198125 \n", + "2 0.275749 \n", + "3 0.253014 \n", + "4 0.216409 \n", + "\n", + " original_glszm_SmallAreaEmphasis original_ngtdm_Busyness \\\n", + "0 0.403535 473.464852 \n", + "1 0.444391 59.459710 \n", + "2 0.534549 33.935384 \n", + "3 0.506185 46.859265 \n", + "4 0.462282 39.621023 \n", + "\n", + " original_ngtdm_Strength \n", + "0 0.000758 \n", + "1 0.003685 \n", + "2 0.006447 \n", + "3 0.004543 \n", + "4 0.005626 \n", + "\n", + "[5 rows x 36 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_corr.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b361c4c4-54c2-4de3-b5d3-c9a276019f36", + "metadata": {}, + "outputs": [], + "source": [ + "# saving the feature in text file for future use in model training and prediction\n", + "with open('30cor.txt', 'w') as f:\n", + " for feature in X_corr.columns:\n", + " f.write(feature + '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "766f1c86-ea45-4a3d-be13-c1335f27268b", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "# Split the correlated feature into training and testing sets ( correlated features)\n", + "X_train1, X_test1, y_train1, y_test1 = train_test_split(X_corr, y_corr, test_size=0.2,shuffle=True,random_state=42)\n", + "# Standardize the features\n", + "scaler = StandardScaler()\n", + "X_train_sc1 = scaler.fit_transform(X_train1)\n", + "X_test_sc1 = scaler.fit_transform(X_test1)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "790b0a2c-e36e-4f70-ac0f-a2bc8a2f30ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((320, 36),)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train1.shape, " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "08017321-a5fe-4ae4-82c8-8f65b5b9191c", + "metadata": {}, + "outputs": [], + "source": [ + "# dataset using correlated feature selection for forward feature selection\n", + "X_for = df_corr.drop([\"PCR\",\"RelapseFreeSurvival (outcome)\"],axis=1)\n", + "y_for = df_corr[\"PCR\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "9e854be0-bf8e-4ad2-9d12-85584abe4f16", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected features: Index(['ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade', 'HistologyType',\n", + " 'LNStatus', 'Gene', 'original_shape_Elongation',\n", + " 'original_shape_MeshVolume', 'original_firstorder_InterquartileRange',\n", + " 'original_firstorder_Kurtosis', 'original_glcm_Imc1',\n", + " 'original_gldm_SmallDependenceEmphasis',\n", + " 'original_glrlm_LongRunLowGrayLevelEmphasis',\n", + " 'original_glszm_GrayLevelNonUniformity',\n", + " 'original_glszm_GrayLevelNonUniformityNormalized',\n", + " 'original_glszm_SizeZoneNonUniformityNormalized'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.feature_selection import SequentialFeatureSelector\n", + "\n", + "# Handle missing values in y_for\n", + "y_for = y_for.dropna() # Alternatively, use fillna()\n", + "\n", + "# Ensure X_for matches the cleaned y_for\n", + "X_for = X_for.loc[y_for.index]\n", + "\n", + "# Sequential Feature Selector\n", + "selector = SequentialFeatureSelector(\n", + " estimator=RandomForestClassifier(n_estimators=100),\n", + " n_features_to_select='auto'\n", + ")\n", + "\n", + "# Fit the selector to the data\n", + "selector.fit(X_for, y_for)\n", + "\n", + "# Get the selected features\n", + "selected_features1 = X_for.columns[selector.get_support()]\n", + "print(\"Selected features:\", selected_features1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "662c1167-d3b1-4f56-ab3d-de46dfaf915e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 400 entries, 0 to 399\n", + "Data columns (total 18 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 ER 400 non-null int64 \n", + " 1 PgR 400 non-null float64\n", + " 2 HER2 400 non-null float64\n", + " 3 TrippleNegative 400 non-null float64\n", + " 4 ChemoGrade 400 non-null float64\n", + " 5 HistologyType 400 non-null float64\n", + " 6 LNStatus 400 non-null float64\n", + " 7 Gene 400 non-null float64\n", + " 8 original_shape_Elongation 400 non-null float64\n", + " 9 original_shape_MeshVolume 400 non-null float64\n", + " 10 original_firstorder_InterquartileRange 400 non-null float64\n", + " 11 original_firstorder_Kurtosis 400 non-null float64\n", + " 12 original_glcm_Imc1 400 non-null float64\n", + " 13 original_gldm_SmallDependenceEmphasis 400 non-null float64\n", + " 14 original_glrlm_LongRunLowGrayLevelEmphasis 400 non-null float64\n", + " 15 original_glszm_GrayLevelNonUniformity 400 non-null float64\n", + " 16 original_glszm_GrayLevelNonUniformityNormalized 400 non-null float64\n", + " 17 original_glszm_SizeZoneNonUniformityNormalized 400 non-null float64\n", + "dtypes: float64(17), int64(1)\n", + "memory usage: 56.4 KB\n" + ] + } + ], + "source": [ + "# Create a copy of the dataset with the selected features\n", + "X_selected_for1 = df_corr[selected_features1]\n", + "X_selected_for1.info()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "fd51f64a-7f57-493b-8485-41f3e313a61f", + "metadata": {}, + "outputs": [], + "source": [ + "# X and y for forward feature selection\n", + "df3 = df.copy()\n", + "X2 = df3[selected_features1]\n", + "y2 = df3[\"PCR\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "c6f0e344-3a90-4418-8f20-2646f5e66f13", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(400, 18)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X2.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "d81d0dcb-5b32-4a06-93aa-9c8d97a7d575", + "metadata": {}, + "outputs": [], + "source": [ + "# Splitting the data into training and testing sets ( forward feature selection)\n", + "X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2,shuffle=True,random_state=42)\n", + "# Standardize the features\n", + "scaler = StandardScaler()\n", + "X_train_sc2 = scaler.fit_transform(X_train2)\n", + "X_test_sc2 = scaler.fit_transform(X_test2)\n" + ] + }, + { + "cell_type": "markdown", + "id": "ed7db500-58aa-4326-9dd4-d8c4589bac06", + "metadata": {}, + "source": [ + "## Models" + ] + }, + { + "cell_type": "markdown", + "id": "97247500-54a1-4046-bc15-4d8c2d511e97", + "metadata": {}, + "source": [ + "### Linear Regression " + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "b17e1da1-7a02-493f-a2a1-4907be57276b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.82 0.87 0.85 94\n", + " 1.0 0.40 0.31 0.35 26\n", + "\n", + " accuracy 0.75 120\n", + " macro avg 0.61 0.59 0.60 120\n", + "weighted avg 0.73 0.75 0.74 120\n", + "\n", + "\n", + "Initial Balanced Accuracy Score:\n", + "0.5900163666121113\n" + ] + } + ], + "source": [ + "# Import required libraries\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score\n", + "\n", + "# Splitting the data into training and testing sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X_corr, y_corr, test_size=0.3, shuffle=True, random_state=64)\n", + "\n", + "# Standardizing the features\n", + "scaler = StandardScaler()\n", + "X_train_norm = scaler.fit_transform(X_train)\n", + "X_test_norm = scaler.transform(X_test)\n", + "\n", + "# Initial training of Logistic Regression model (without SMOTE)\n", + "log_reg = LogisticRegression(random_state=42)\n", + "log_reg.fit(X_train_norm, y_train)\n", + "\n", + "# Predicting class labels for testing data (initial prediction)\n", + "y_pred = log_reg.predict(X_test_norm)\n", + "\n", + "# Calculating classification report and balanced accuracy score (initial performance)\n", + "report = classification_report(y_test, y_pred)\n", + "balanced_accuracy = balanced_accuracy_score(y_test, y_pred)\n", + "\n", + "print('Initial Classification Report:')\n", + "print(report)\n", + "\n", + "print('\\nInitial Balanced Accuracy Score:')\n", + "print(balanced_accuracy)\n" + ] + }, + { + "cell_type": "markdown", + "id": "315e1a8f-b4d6-49f1-b7a3-704178f5074f", + "metadata": {}, + "source": [ + "### Random forest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5a51ba9-666c-4dfd-b370-a9f84555a35f", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import balanced_accuracy_score, classification_report\n", + "from sklearn.impute import SimpleImputer\n", + "import pandas as pd\n", + "\n", + "# Handling missing values in features\n", + "imputer = SimpleImputer(strategy='mean')\n", + "X_train_sc1 = imputer.fit_transform(X_train_sc1)\n", + "X_train_sc2 = imputer.fit_transform(X_train_sc2)\n", + "\n", + "# Handling missing values in targets\n", + "y_train1 = pd.Series(y_train1).fillna(pd.Series(y_train1).mode()[0])\n", + "y_train2 = pd.Series(y_train2).fillna(pd.Series(y_train2).mode()[0])\n", + "\n", + "# Defining hyperparameter grid for RandomForestClassifier\n", + "param_grid = {\n", + " 'n_estimators': [100, 200, 300],\n", + " 'max_depth': [2, 5, 10],\n", + " 'min_samples_split': [2, 5, 10],\n", + " 'min_samples_leaf': [1, 2, 4],\n", + " 'random_state': [42]\n", + "}\n", + "\n", + "# Instantiate RandomForestClassifier\n", + "model1 = RandomForestClassifier()\n", + "\n", + "# Creating GridSearchCV objects for dataset1 and dataset2\n", + "grid_search1 = GridSearchCV(estimator=model1, param_grid=param_grid, scoring='balanced_accuracy', cv=5)\n", + "grid_search2 = GridSearchCV(estimator=model1, param_grid=param_grid, scoring='balanced_accuracy', cv=5)\n", + "\n", + "# Fitting the GridSearchCV objects for dataset1 and dataset2\n", + "grid_search1.fit(X_train_sc1, y_train1)\n", + "grid_search2.fit(X_train_sc2, y_train2)\n", + "\n", + "# Getting the best parameters from GridSearchCV for dataset1 and dataset2\n", + "best_params1 = grid_search1.best_params_\n", + "best_params2 = grid_search2.best_params_\n", + "\n", + "# Printing the best parameters for each dataset\n", + "print(\"Best Parameters for Dataset 1:\", best_params1)\n", + "print(\"Best Parameters for Dataset 2:\", best_params2)\n", + "\n", + "# Getting the balanced accuracy for the best model from GridSearchCV for each dataset\n", + "best_model1 = RandomForestClassifier(**best_params1)\n", + "best_model1.fit(X_train_sc1, y_train1)\n", + "y_pred1 = best_model1.predict(X_test_sc1)\n", + "balanced_accuracy1 = balanced_accuracy_score(y_test1, y_pred1)\n", + "\n", + "best_model2 = RandomForestClassifier(**best_params2)\n", + "best_model2.fit(X_train_sc2, y_train2)\n", + "y_pred2 = best_model2.predict(X_test_sc2)\n", + "balanced_accuracy2 = balanced_accuracy_score(y_test2, y_pred2)\n", + "\n", + "# Printing the balanced accuracy for each dataset\n", + "print(\"Dataset 1 Balanced Accuracy:\", balanced_accuracy1)\n", + "print(\"Dataset 2 Balanced Accuracy:\", balanced_accuracy2)\n", + "\n", + "# Getting the classification report for the best model from GridSearchCV for each dataset\n", + "print(\"Dataset 1 Classification Report:\\n\", classification_report(y_test1, y_pred1))\n", + "print(\"Dataset 2 Classification Report:\\n\", classification_report(y_test2, y_pred2))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "da95f0ab-9167-4bdf-a459-27a1a5caa8c9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_logistic.py:1197: UserWarning: l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n", + "C:\\Users\\LLR User\\miniconda3\\envs\\MLE\\Lib\\site-packages\\sklearn\\linear_model\\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Parameters: {'C': 1, 'l1_ratio': 0.9, 'penalty': 'elasticnet', 'solver': 'saga'}\n", + "Balanced Accuracy: 0.671875\n", + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.87 0.91 0.89 64\n", + " 1.0 0.54 0.44 0.48 16\n", + "\n", + " accuracy 0.81 80\n", + " macro avg 0.70 0.67 0.68 80\n", + "weighted avg 0.80 0.81 0.80 80\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import balanced_accuracy_score, classification_report\n", + "\n", + "# Simplified hyperparameter grid\n", + "param_grid = {\n", + " 'penalty': ['l1', 'l2', 'elasticnet'], # Common penalties\n", + " 'solver': ['saga'], # 'saga' supports all penalties including 'elasticnet'\n", + " 'C': [0.01, 0.1, 1, 10], # Regularization strength\n", + " 'l1_ratio': [0.1, 0.5, 0.9] # Required only for 'elasticnet'\n", + "}\n", + "\n", + "# Instantiate Logistic Regression model\n", + "model = LogisticRegression(max_iter=1000, random_state=42) # Increased max_iter for convergence\n", + "\n", + "# Create GridSearchCV\n", + "grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='balanced_accuracy', cv=5)\n", + "\n", + "# Fit the GridSearchCV to the data\n", + "grid_search.fit(X_train_sc1, y_train1)\n", + "\n", + "# Get the best parameters and score\n", + "best_params = grid_search.best_params_\n", + "print(\"Best Parameters:\", best_params)\n", + "\n", + "# Evaluate on the test set\n", + "best_model = LogisticRegression(**best_params, max_iter=1000, random_state=42)\n", + "best_model.fit(X_train_sc1, y_train1)\n", + "y_pred = best_model.predict(X_test_sc1)\n", + "\n", + "# Balanced accuracy and classification report\n", + "balanced_accuracy = balanced_accuracy_score(y_test1, y_pred)\n", + "print(\"Balanced Accuracy:\", balanced_accuracy)\n", + "print(\"Classification Report:\\n\", classification_report(y_test1, y_pred))\n" + ] + }, + { + "cell_type": "markdown", + "id": "2f818089-499c-41bc-a8e7-1f79940f18a9", + "metadata": {}, + "source": [ + "### SVC" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7e32d5dd-48fe-415a-8ab5-69a8c5716144", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset1: {'C': 100, 'gamma': 0.001, 'kernel': 'linear'}\n", + "Dataset2: {'C': 150, 'gamma': 0.01, 'kernel': 'rbf'}\n", + "correlation balanced accuracy: 0.640625\n", + "forward balanced accuracy: 0.546875\n", + "correlation classification report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.86 0.84 0.85 64\n", + " 1.0 0.41 0.44 0.42 16\n", + "\n", + " accuracy 0.76 80\n", + " macro avg 0.63 0.64 0.64 80\n", + "weighted avg 0.77 0.76 0.77 80\n", + "\n", + "forward classification report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.82 0.84 0.83 64\n", + " 1.0 0.29 0.25 0.27 16\n", + "\n", + " accuracy 0.72 80\n", + " macro avg 0.55 0.55 0.55 80\n", + "weighted avg 0.71 0.72 0.72 80\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.svm import SVC\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import balanced_accuracy_score, classification_report\n", + "\n", + "# Defining hyperparameter grid for SVC\n", + "param_grid = {\n", + " 'kernel': ['linear', 'rbf', 'poly'],\n", + " 'C': [0.01, 0.1, 1, 10, 100, 150],\n", + " 'gamma': [0.001, 0.01, 0.1, 1]\n", + "}\n", + "\n", + "# Instantiating SVC classifier \n", + "model1 = SVC()\n", + "\n", + "# Creating the GridSearchCV objects for dataset 1 and 2 (correlation and forward selection datasets)\n", + "grid_search1 = GridSearchCV(estimator=model1, param_grid=param_grid, scoring='balanced_accuracy', cv=5)\n", + "grid_search2 = GridSearchCV(estimator=model1, param_grid=param_grid, scoring='balanced_accuracy', cv=5)\n", + "\n", + "# Fitting the GridSearchCV objects for dataset 1 and 2\n", + "grid_search1.fit(X_train_sc1, y_train1)\n", + "grid_search2.fit(X_train_sc2, y_train2)\n", + "\n", + "# Getting the best parameters from GridSearchCV for dataset 1 and 2\n", + "best_params1 = grid_search1.best_params_\n", + "best_params2 = grid_search2.best_params_\n", + "\n", + "# Printing the best parameters for each dataset\n", + "print(\"Dataset1:\", best_params1)\n", + "print(\"Dataset2:\", best_params2)\n", + "\n", + "# Getting the balanced accuracy for the best model from GridSearchCV for each dataset\n", + "best_model1 = SVC(**best_params1)\n", + "best_model1.fit(X_train_sc1, y_train1)\n", + "y_pred1 = best_model1.predict(X_test_sc1)\n", + "balanced_accuracy1 = balanced_accuracy_score(y_test1, y_pred1)\n", + "\n", + "best_model2 = SVC(**best_params2)\n", + "best_model2.fit(X_train_sc2, y_train2)\n", + "y_pred2 = best_model2.predict(X_test_sc2)\n", + "balanced_accuracy2 = balanced_accuracy_score(y_test2, y_pred2)\n", + "\n", + "# Printing the balanced accuracy for each dataset\n", + "print(\"correlation balanced accuracy:\", balanced_accuracy1)\n", + "print(\"forward balanced accuracy:\", balanced_accuracy2)\n", + "\n", + "# Getting the classification report for the best model from GridSearchCV for each dataset\n", + "print(\"correlation classification report:\\n\", classification_report(y_test1, y_pred1))\n", + "print(\"forward classification report:\\n\", classification_report(y_test2, y_pred2))\n" + ] + }, + { + "cell_type": "markdown", + "id": "936900ac-9890-449e-a8e7-f1d7aef72777", + "metadata": {}, + "source": [ + "### Decision Tree" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "360fab3f-6fd3-488b-a6af-9432216714d1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset1: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'random_state': 42}\n", + "Dataset2: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 42}\n", + "correlation balanced accuracy: 0.515625\n", + "forward balanced accuracy: 0.53125\n", + "correlation classification report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.81 0.91 0.85 64\n", + " 1.0 0.25 0.12 0.17 16\n", + "\n", + " accuracy 0.75 80\n", + " macro avg 0.53 0.52 0.51 80\n", + "weighted avg 0.69 0.75 0.72 80\n", + "\n", + "forward classification report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.81 0.91 0.85 64\n", + " 1.0 0.25 0.12 0.17 16\n", + "\n", + " accuracy 0.75 80\n", + " macro avg 0.53 0.52 0.51 80\n", + "weighted avg 0.69 0.75 0.72 80\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "# Defining hyperparameter grid for DecisionTreeClassifier\n", + "param_grid = {\n", + " 'criterion': ['entropy', 'gini'],\n", + " 'max_depth': [2, 5, 10],\n", + " 'min_samples_split': [2, 5, 10],\n", + " 'min_samples_leaf': [1, 2, 4],\n", + " 'random_state': [42]\n", + "}\n", + "\n", + "# Instantiating DecisionTreeClassifier \n", + "model1 = DecisionTreeClassifier()\n", + "\n", + "# Creating GridSearchCV objects for dataset 1 and 2\n", + "grid_search1 = GridSearchCV(estimator=model1, param_grid=param_grid, scoring='balanced_accuracy', cv=5)\n", + "grid_search2 = GridSearchCV(estimator=model1, param_grid=param_grid, scoring='balanced_accuracy', cv=5)\n", + "\n", + "# Fitting the GridSearchCV objects for dataset 1 and 2\n", + "grid_search1.fit(X_train_sc1, y_train1)\n", + "grid_search2.fit(X_train_sc2, y_train2)\n", + "\n", + "# Getting the best parameters from GridSearchCV for dataset 1 and 2\n", + "best_params1 = grid_search1.best_params_\n", + "best_params2 = grid_search2.best_params_\n", + "\n", + "# Printting the best parameters for each dataset\n", + "print(\"Dataset1:\", best_params1)\n", + "print(\"Dataset2:\", best_params2)\n", + "\n", + "# Getting the balanced accuracy for the best model from GridSearchCV for each dataset\n", + "best_model1 = DecisionTreeClassifier(**best_params1)\n", + "best_model1.fit(X_train_sc1, y_train1)\n", + "y_pred1 = best_model1.predict(X_test_sc1)\n", + "balanced_accuracy1 = balanced_accuracy_score(y_test1, y_pred1)\n", + "\n", + "best_model2 = DecisionTreeClassifier(**best_params2)\n", + "best_model2.fit(X_train_sc2, y_train2)\n", + "y_pred2 = best_model2.predict(X_test_sc2)\n", + "balanced_accuracy2 = balanced_accuracy_score(y_test2, y_pred2)\n", + "\n", + "# Printting the balanced accuracy for each dataset\n", + "print(\"correlation balanced accuracy:\", balanced_accuracy1)\n", + "print(\"forward balanced accuracy:\", balanced_accuracy2)\n", + "\n", + "# Getting the classification report for the best model from GridSearchCV for each dataset\n", + "print(\"correlation classification report:\\n\", classification_report(y_test1, y_pred1))\n", + "print(\"forward classification report:\\n\", classification_report(y_test2, y_pred1))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "a26e6775-c605-4dd2-8ef2-5d89a255a8f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.79 1.00 0.88 94\n", + " 1.0 1.00 0.04 0.07 26\n", + "\n", + " accuracy 0.79 120\n", + " macro avg 0.89 0.52 0.48 120\n", + "weighted avg 0.84 0.79 0.71 120\n", + "\n", + "\n", + "Balanced Accuracy Score:\n", + "0.5192307692307693\n" + ] + } + ], + "source": [ + "# training the random forest model for correlated features dataset\n", + "X_train, X_test, y_train, y_test = train_test_split(X_corr, y_corr, test_size=0.3,shuffle= True, random_state=64)\n", + "scaler = StandardScaler()\n", + "X_train_norm = scaler.fit_transform(X_train)\n", + "X_test_norm = scaler.transform(X_test)\n", + "\n", + "# Creating and train Random Forest classifier with specific parameters got from gridsearch\n", + "rf_classifier = RandomForestClassifier(max_depth=10, min_samples_leaf=3, min_samples_split=15, n_estimators=1000, random_state=42)\n", + "rf_classifier.fit(X_train_norm, y_train)\n", + "\n", + "# Predicting class labels for testing data\n", + "y_pred = rf_classifier.predict(X_test_norm)\n", + "\n", + "# Calculating classification report and balanced accuracy score\n", + "classification_report = classification_report(y_test, y_pred)\n", + "balanced_accuracy = balanced_accuracy_score(y_test, y_pred)\n", + "\n", + "print('Classification Report:')\n", + "print(classification_report)\n", + "\n", + "print('\\nBalanced Accuracy Score:')\n", + "print(balanced_accuracy)\n" + ] + }, + { + "cell_type": "markdown", + "id": "b2491b03-0e49-4f7b-934c-300f6bfd5fa9", + "metadata": {}, + "source": [ + "### Smote for Random forest" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "95e1c8ab-bec2-4eee-a4d3-ff255ff2461d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Classification Reports for each fold:\n", + "\n", + "Fold 1 Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.84 0.97 0.90 33\n", + " 1.0 0.50 0.14 0.22 7\n", + "\n", + " accuracy 0.82 40\n", + " macro avg 0.67 0.56 0.56 40\n", + "weighted avg 0.78 0.82 0.78 40\n", + "\n", + "\n", + "Fold 2 Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.82 0.90 0.86 31\n", + " 1.0 0.50 0.33 0.40 9\n", + "\n", + " accuracy 0.78 40\n", + " macro avg 0.66 0.62 0.63 40\n", + "weighted avg 0.75 0.78 0.76 40\n", + "\n", + "\n", + "Fold 3 Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.79 0.97 0.87 31\n", + " 1.0 0.50 0.11 0.18 9\n", + "\n", + " accuracy 0.78 40\n", + " macro avg 0.64 0.54 0.53 40\n", + "weighted avg 0.72 0.78 0.71 40\n", + "\n", + "\n", + "Fold 4 Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.86 0.91 0.88 33\n", + " 1.0 0.40 0.29 0.33 7\n", + "\n", + " accuracy 0.80 40\n", + " macro avg 0.63 0.60 0.61 40\n", + "weighted avg 0.78 0.80 0.79 40\n", + "\n", + "\n", + "Fold 5 Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.79 0.79 0.79 29\n", + " 1.0 0.45 0.45 0.45 11\n", + "\n", + " accuracy 0.70 40\n", + " macro avg 0.62 0.62 0.62 40\n", + "weighted avg 0.70 0.70 0.70 40\n", + "\n", + "\n", + "Fold 6 Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.81 0.94 0.87 31\n", + " 1.0 0.50 0.22 0.31 9\n", + "\n", + " accuracy 0.78 40\n", + " macro avg 0.65 0.58 0.59 40\n", + "weighted avg 0.74 0.78 0.74 40\n", + "\n", + "\n", + "Fold 7 Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.78 1.00 0.88 29\n", + " 1.0 1.00 0.27 0.43 11\n", + "\n", + " accuracy 0.80 40\n", + " macro avg 0.89 0.64 0.65 40\n", + "weighted avg 0.84 0.80 0.75 40\n", + "\n", + "\n", + "Fold 8 Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.94 0.86 0.90 37\n", + " 1.0 0.17 0.33 0.22 3\n", + "\n", + " accuracy 0.82 40\n", + " macro avg 0.55 0.60 0.56 40\n", + "weighted avg 0.88 0.82 0.85 40\n", + "\n", + "\n", + "Fold 9 Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.83 0.94 0.88 32\n", + " 1.0 0.50 0.25 0.33 8\n", + "\n", + " accuracy 0.80 40\n", + " macro avg 0.67 0.59 0.61 40\n", + "weighted avg 0.77 0.80 0.77 40\n", + "\n", + "\n", + "Fold 10 Classification Report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.84 0.90 0.87 30\n", + " 1.0 0.62 0.50 0.56 10\n", + "\n", + " accuracy 0.80 40\n", + " macro avg 0.73 0.70 0.71 40\n", + "weighted avg 0.79 0.80 0.79 40\n", + "\n", + "\n", + "Balanced Accuracy Scores for each fold:\n", + "Fold 1 Balanced Accuracy: 0.5563\n", + "Fold 2 Balanced Accuracy: 0.6183\n", + "Fold 3 Balanced Accuracy: 0.5394\n", + "Fold 4 Balanced Accuracy: 0.5974\n", + "Fold 5 Balanced Accuracy: 0.6238\n", + "Fold 6 Balanced Accuracy: 0.5789\n", + "Fold 7 Balanced Accuracy: 0.6364\n", + "Fold 8 Balanced Accuracy: 0.5991\n", + "Fold 9 Balanced Accuracy: 0.5938\n", + "Fold 10 Balanced Accuracy: 0.7000\n", + "\n", + "Mean Balanced Accuracy Score across all folds: 0.6043275980337994\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import KFold\n", + "from imblearn.over_sampling import SMOTE\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report as clf_report, balanced_accuracy_score\n", + "import numpy as np\n", + "\n", + "# Initialize KFold with 10 splits\n", + "k = KFold(n_splits=10, shuffle=True, random_state=42)\n", + "\n", + "# Initialize SMOTE\n", + "sm = SMOTE(random_state=42)\n", + "\n", + "# Initialize Random Forest model\n", + "rf_classifier = RandomForestClassifier(max_depth=10, min_samples_leaf=3, min_samples_split=10, n_estimators=1000, random_state=42)\n", + "\n", + "# Lists to hold the results\n", + "all_classification_reports = []\n", + "all_balanced_accuracies = []\n", + "\n", + "# K-Fold Cross-Validation Loop\n", + "for train_idx, test_idx in k.split(X_corr, y_corr):\n", + " # Split the data into train and test based on the indices\n", + " X_train, X_test = X_corr.iloc[train_idx], X_corr.iloc[test_idx]\n", + " y_train, y_test = y_corr.iloc[train_idx], y_corr.iloc[test_idx]\n", + " \n", + " # Resampling the data using SMOTE only for the training data\n", + " X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)\n", + " \n", + " # Scaling data with StandardScaler\n", + " scaler = StandardScaler()\n", + " X_train_norm = scaler.fit_transform(X_train_resampled)\n", + " X_test_norm = scaler.transform(X_test) # Only transform test set\n", + " \n", + " # Train the Random Forest model\n", + " rf_classifier.fit(X_train_norm, y_train_resampled)\n", + " \n", + " # Predicting\n", + " y_pred = rf_classifier.predict(X_test_norm)\n", + " \n", + " # Generating the classification report and balanced accuracy score for each fold\n", + " report = clf_report(y_test, y_pred, zero_division=0)\n", + " balanced_accuracy = balanced_accuracy_score(y_test, y_pred)\n", + " \n", + " # Append results\n", + " all_classification_reports.append(report)\n", + " all_balanced_accuracies.append(balanced_accuracy)\n", + "\n", + "# Displaying the results\n", + "print(\"\\nClassification Reports for each fold:\")\n", + "for i, report in enumerate(all_classification_reports, 1):\n", + " print(f\"\\nFold {i} Classification Report:\")\n", + " print(report)\n", + "\n", + "print(\"\\nBalanced Accuracy Scores for each fold:\")\n", + "for i, score in enumerate(all_balanced_accuracies, 1):\n", + " print(f\"Fold {i} Balanced Accuracy: {score:.4f}\")\n", + "\n", + "# Optionally, you can also display the mean of the balanced accuracies\n", + "print(\"\\nMean Balanced Accuracy Score across all folds:\", np.mean(all_balanced_accuracies))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b182154-61c7-48ec-b601-7edf2537fd0c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}