384 lines (383 with data), 12.6 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"\n",
"# Add the parent directory to the system path\n",
"sys.path.append(os.path.abspath('../')) # Adjust the path as needed\n",
"\n",
"from my_util import df_to_corr_matrix\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.preprocessing import StandardScaler, RobustScaler\n",
"from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
"\n",
"\n",
"from joblib import Parallel, delayed\n",
"\n",
"from pickle import dump , load"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Read data\n",
"training_file = \"../TrainDataset2024.xls\"\n",
"\n",
"data = pd.read_excel(training_file)\n",
"data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
"data_no_na = data.replace(999, np.nan)\n",
"data_no_na.dropna(ignore_index=True, inplace=True)\n",
"X = data_no_na.drop('pCR (outcome)', axis=1)\n",
"y = data_no_na['pCR (outcome)']"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Drop highly correlated features\n",
"CORR_THRESHOLD = 0.8\n",
"# Create a correlation matrix\n",
"correlation_matrix = X.corr()\n",
"\n",
"highly_correlated_features = set()\n",
"\n",
"for i in range(len(correlation_matrix.columns)):\n",
" for j in range(i):\n",
" if abs(correlation_matrix.iloc[i, j]) > CORR_THRESHOLD:\n",
" highly_correlated_features.add(correlation_matrix.columns[i])\n",
"\n",
"X_no_highly_correlated = X.drop(columns=highly_correlated_features)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler()\n",
"Xs = scaler.fit_transform(X_no_highly_correlated)\n",
"Xs = pd.DataFrame(Xs, columns=X_no_highly_correlated.columns)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>pCR (outcome)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Gene</th>\n",
" <td>0.419255</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HER2</th>\n",
" <td>0.257349</td>\n",
" </tr>\n",
" <tr>\n",
" <th>PgR</th>\n",
" <td>0.213667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ER</th>\n",
" <td>0.182310</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_firstorder_10Percentile</th>\n",
" <td>0.154003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_ngtdm_Busyness</th>\n",
" <td>0.132275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>LNStatus</th>\n",
" <td>0.128529</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TumourStage</th>\n",
" <td>0.113840</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_gldm_DependenceEntropy</th>\n",
" <td>0.109880</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_firstorder_Skewness</th>\n",
" <td>0.107543</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glrlm_ShortRunHighGrayLevelEmphasis</th>\n",
" <td>0.086085</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_ngtdm_Strength</th>\n",
" <td>0.082920</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_gldm_SmallDependenceEmphasis</th>\n",
" <td>0.077261</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_firstorder_InterquartileRange</th>\n",
" <td>0.071577</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_MajorAxisLength</th>\n",
" <td>0.063093</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glrlm_LongRunLowGrayLevelEmphasis</th>\n",
" <td>0.062274</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_firstorder_Minimum</th>\n",
" <td>0.060708</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HistologyType</th>\n",
" <td>0.053591</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ChemoGrade</th>\n",
" <td>0.053219</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_Maximum2DDiameterRow</th>\n",
" <td>0.053068</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_Maximum2DDiameterColumn</th>\n",
" <td>0.050781</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_SurfaceVolumeRatio</th>\n",
" <td>0.047749</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_LeastAxisLength</th>\n",
" <td>0.038322</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glcm_Autocorrelation</th>\n",
" <td>0.033802</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_Sphericity</th>\n",
" <td>0.032420</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glszm_SizeZoneNonUniformityNormalized</th>\n",
" <td>0.031342</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glszm_SmallAreaEmphasis</th>\n",
" <td>0.028493</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_Elongation</th>\n",
" <td>0.027240</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_firstorder_Kurtosis</th>\n",
" <td>0.026213</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glszm_GrayLevelNonUniformity</th>\n",
" <td>0.020245</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TrippleNegative</th>\n",
" <td>0.016910</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glcm_Imc1</th>\n",
" <td>0.016507</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_firstorder_90Percentile</th>\n",
" <td>0.016038</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glcm_Correlation</th>\n",
" <td>0.007009</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glszm_GrayLevelNonUniformityNormalized</th>\n",
" <td>0.003004</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Age</th>\n",
" <td>0.000357</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" pCR (outcome)\n",
"Gene 0.419255\n",
"HER2 0.257349\n",
"PgR 0.213667\n",
"ER 0.182310\n",
"original_firstorder_10Percentile 0.154003\n",
"original_ngtdm_Busyness 0.132275\n",
"LNStatus 0.128529\n",
"TumourStage 0.113840\n",
"original_gldm_DependenceEntropy 0.109880\n",
"original_firstorder_Skewness 0.107543\n",
"original_glrlm_ShortRunHighGrayLevelEmphasis 0.086085\n",
"original_ngtdm_Strength 0.082920\n",
"original_gldm_SmallDependenceEmphasis 0.077261\n",
"original_firstorder_InterquartileRange 0.071577\n",
"original_shape_MajorAxisLength 0.063093\n",
"original_glrlm_LongRunLowGrayLevelEmphasis 0.062274\n",
"original_firstorder_Minimum 0.060708\n",
"HistologyType 0.053591\n",
"ChemoGrade 0.053219\n",
"original_shape_Maximum2DDiameterRow 0.053068\n",
"original_shape_Maximum2DDiameterColumn 0.050781\n",
"original_shape_SurfaceVolumeRatio 0.047749\n",
"original_shape_LeastAxisLength 0.038322\n",
"original_glcm_Autocorrelation 0.033802\n",
"original_shape_Sphericity 0.032420\n",
"original_glszm_SizeZoneNonUniformityNormalized 0.031342\n",
"original_glszm_SmallAreaEmphasis 0.028493\n",
"original_shape_Elongation 0.027240\n",
"original_firstorder_Kurtosis 0.026213\n",
"original_glszm_GrayLevelNonUniformity 0.020245\n",
"TrippleNegative 0.016910\n",
"original_glcm_Imc1 0.016507\n",
"original_firstorder_90Percentile 0.016038\n",
"original_glcm_Correlation 0.007009\n",
"original_glszm_GrayLevelNonUniformityNormalized 0.003004\n",
"Age 0.000357"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.concat([y, Xs], axis=1)\n",
"\n",
"corr = df.corr()[[\"pCR (outcome)\"]]\n",
"\n",
"corr.drop([\"pCR (outcome)\"], inplace=True)\n",
"\n",
"corr[\"pCR (outcome)\"] = abs(corr[\"pCR (outcome)\"])\n",
"\n",
"sorted = corr.sort_values(by=\"pCR (outcome)\", ascending=False)\n",
"\n",
"sorted"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"36\n"
]
}
],
"source": [
"print(len(sorted))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saved pkl/corr_5_selected_features.pkl\n",
"Saved pkl/corr_10_selected_features.pkl\n",
"Saved pkl/corr_15_selected_features.pkl\n",
"Saved pkl/corr_20_selected_features.pkl\n",
"Saved pkl/corr_25_selected_features.pkl\n",
"Saved pkl/corr_30_selected_features.pkl\n",
"Saved pkl/corr_35_selected_features.pkl\n"
]
}
],
"source": [
"num_of_features_list = [5, 10, 15, 20, 25, 30, 35]\n",
"\n",
"for n in num_of_features_list:\n",
" with open(f\"pkl/corr_{n}_selected_features.pkl\", 'wb') as file:\n",
" dump(list(sorted[:n].index), file)\n",
" print(f\"Saved {file.name}\")\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "MLEAsm",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}