[e6e569]: / FeatureSelection / feature_selection_classification.ipynb

Download this file

384 lines (383 with data), 12.6 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "# Add the parent directory to the system path\n",
    "sys.path.append(os.path.abspath('../'))  # Adjust the path as needed\n",
    "\n",
    "from my_util import df_to_corr_matrix\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
    "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
    "\n",
    "\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "from pickle import dump , load"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the training spreadsheet and split it into features (X) and target (y).\n",
    "training_file = \"../TrainDataset2024.xls\"\n",
    "\n",
    "# Remove the identifier and the second outcome column so neither ends up in X.\n",
    "data = pd.read_excel(training_file).drop(\n",
    "    columns=[\"ID\", \"RelapseFreeSurvival (outcome)\"]\n",
    ")\n",
    "\n",
    "# Every 999 becomes NaN (presumably the dataset's missing-value marker - confirm),\n",
    "# then any row holding a NaN is discarded. ignore_index=True renumbers the\n",
    "# surviving rows from 0.\n",
    "data_no_na = data.replace(999, np.nan).dropna(ignore_index=True)\n",
    "\n",
    "y = data_no_na['pCR (outcome)']\n",
    "X = data_no_na.drop(columns='pCR (outcome)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop highly correlated features\n",
    "CORR_THRESHOLD = 0.8\n",
    "\n",
    "# Pairwise correlation matrix of the feature columns.\n",
    "correlation_matrix = X.corr()\n",
    "\n",
    "# True only strictly above the diagonal (row position < column position), so each\n",
    "# column is compared solely with the columns that come before it.\n",
    "earlier_column_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)\n",
    "\n",
    "# Per column: does any earlier column correlate with it above the threshold?\n",
    "# Masked-out cells become NaN, and NaN > threshold evaluates to False.\n",
    "exceeds_threshold = (\n",
    "    correlation_matrix.abs().where(earlier_column_mask) > CORR_THRESHOLD\n",
    ").any(axis=0)\n",
    "\n",
    "# NOTE: the earlier column of a flagged pair may itself have been flagged.\n",
    "highly_correlated_features = set(exceeds_threshold[exceeds_threshold].index)\n",
    "\n",
    "X_no_highly_correlated = X.drop(columns=highly_correlated_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standardise every remaining feature with StandardScaler.\n",
    "scaler = StandardScaler()\n",
    "\n",
    "# fit_transform returns a NumPy array by default, so the result is wrapped back\n",
    "# into a DataFrame with the original column names AND the original row index.\n",
    "# Keeping the index means Xs stays row-aligned with y whenever the two are\n",
    "# combined, even if the frame's index is ever something other than 0..n-1.\n",
    "Xs = pd.DataFrame(\n",
    "    scaler.fit_transform(X_no_highly_correlated),\n",
    "    columns=X_no_highly_correlated.columns,\n",
    "    index=X_no_highly_correlated.index,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pCR (outcome)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Gene</th>\n",
       "      <td>0.419255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HER2</th>\n",
       "      <td>0.257349</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PgR</th>\n",
       "      <td>0.213667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ER</th>\n",
       "      <td>0.182310</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_firstorder_10Percentile</th>\n",
       "      <td>0.154003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_ngtdm_Busyness</th>\n",
       "      <td>0.132275</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>LNStatus</th>\n",
       "      <td>0.128529</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TumourStage</th>\n",
       "      <td>0.113840</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_gldm_DependenceEntropy</th>\n",
       "      <td>0.109880</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_firstorder_Skewness</th>\n",
       "      <td>0.107543</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_glrlm_ShortRunHighGrayLevelEmphasis</th>\n",
       "      <td>0.086085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_ngtdm_Strength</th>\n",
       "      <td>0.082920</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_gldm_SmallDependenceEmphasis</th>\n",
       "      <td>0.077261</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_firstorder_InterquartileRange</th>\n",
       "      <td>0.071577</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_shape_MajorAxisLength</th>\n",
       "      <td>0.063093</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_glrlm_LongRunLowGrayLevelEmphasis</th>\n",
       "      <td>0.062274</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_firstorder_Minimum</th>\n",
       "      <td>0.060708</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HistologyType</th>\n",
       "      <td>0.053591</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ChemoGrade</th>\n",
       "      <td>0.053219</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_shape_Maximum2DDiameterRow</th>\n",
       "      <td>0.053068</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_shape_Maximum2DDiameterColumn</th>\n",
       "      <td>0.050781</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_shape_SurfaceVolumeRatio</th>\n",
       "      <td>0.047749</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_shape_LeastAxisLength</th>\n",
       "      <td>0.038322</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_glcm_Autocorrelation</th>\n",
       "      <td>0.033802</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_shape_Sphericity</th>\n",
       "      <td>0.032420</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_glszm_SizeZoneNonUniformityNormalized</th>\n",
       "      <td>0.031342</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_glszm_SmallAreaEmphasis</th>\n",
       "      <td>0.028493</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_shape_Elongation</th>\n",
       "      <td>0.027240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_firstorder_Kurtosis</th>\n",
       "      <td>0.026213</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_glszm_GrayLevelNonUniformity</th>\n",
       "      <td>0.020245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TrippleNegative</th>\n",
       "      <td>0.016910</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_glcm_Imc1</th>\n",
       "      <td>0.016507</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_firstorder_90Percentile</th>\n",
       "      <td>0.016038</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_glcm_Correlation</th>\n",
       "      <td>0.007009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>original_glszm_GrayLevelNonUniformityNormalized</th>\n",
       "      <td>0.003004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Age</th>\n",
       "      <td>0.000357</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 pCR (outcome)\n",
       "Gene                                                  0.419255\n",
       "HER2                                                  0.257349\n",
       "PgR                                                   0.213667\n",
       "ER                                                    0.182310\n",
       "original_firstorder_10Percentile                      0.154003\n",
       "original_ngtdm_Busyness                               0.132275\n",
       "LNStatus                                              0.128529\n",
       "TumourStage                                           0.113840\n",
       "original_gldm_DependenceEntropy                       0.109880\n",
       "original_firstorder_Skewness                          0.107543\n",
       "original_glrlm_ShortRunHighGrayLevelEmphasis          0.086085\n",
       "original_ngtdm_Strength                               0.082920\n",
       "original_gldm_SmallDependenceEmphasis                 0.077261\n",
       "original_firstorder_InterquartileRange                0.071577\n",
       "original_shape_MajorAxisLength                        0.063093\n",
       "original_glrlm_LongRunLowGrayLevelEmphasis            0.062274\n",
       "original_firstorder_Minimum                           0.060708\n",
       "HistologyType                                         0.053591\n",
       "ChemoGrade                                            0.053219\n",
       "original_shape_Maximum2DDiameterRow                   0.053068\n",
       "original_shape_Maximum2DDiameterColumn                0.050781\n",
       "original_shape_SurfaceVolumeRatio                     0.047749\n",
       "original_shape_LeastAxisLength                        0.038322\n",
       "original_glcm_Autocorrelation                         0.033802\n",
       "original_shape_Sphericity                             0.032420\n",
       "original_glszm_SizeZoneNonUniformityNormalized        0.031342\n",
       "original_glszm_SmallAreaEmphasis                      0.028493\n",
       "original_shape_Elongation                             0.027240\n",
       "original_firstorder_Kurtosis                          0.026213\n",
       "original_glszm_GrayLevelNonUniformity                 0.020245\n",
       "TrippleNegative                                       0.016910\n",
       "original_glcm_Imc1                                    0.016507\n",
       "original_firstorder_90Percentile                      0.016038\n",
       "original_glcm_Correlation                             0.007009\n",
       "original_glszm_GrayLevelNonUniformityNormalized       0.003004\n",
       "Age                                                   0.000357"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.concat([y, Xs], axis=1)\n",
    "\n",
    "corr = df.corr()[[\"pCR (outcome)\"]]\n",
    "\n",
    "corr.drop([\"pCR (outcome)\"], inplace=True)\n",
    "\n",
    "corr[\"pCR (outcome)\"] = abs(corr[\"pCR (outcome)\"])\n",
    "\n",
    "sorted = corr.sort_values(by=\"pCR (outcome)\", ascending=False)\n",
    "\n",
    "sorted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "36\n"
     ]
    }
   ],
   "source": [
    "print(len(sorted))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved pkl/corr_5_selected_features.pkl\n",
      "Saved pkl/corr_10_selected_features.pkl\n",
      "Saved pkl/corr_15_selected_features.pkl\n",
      "Saved pkl/corr_20_selected_features.pkl\n",
      "Saved pkl/corr_25_selected_features.pkl\n",
      "Saved pkl/corr_30_selected_features.pkl\n",
      "Saved pkl/corr_35_selected_features.pkl\n"
     ]
    }
   ],
   "source": [
    "num_of_features_list = [5, 10, 15, 20, 25, 30, 35]\n",
    "\n",
    "for n in num_of_features_list:\n",
    "  with open(f\"pkl/corr_{n}_selected_features.pkl\", 'wb') as file:\n",
    "    dump(list(sorted[:n].index), file)\n",
    "    print(f\"Saved {file.name}\")\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MLEAsm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}