443 lines (442 with data), 20.5 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"\n",
"# Add the parent directory to the system path\n",
"sys.path.append(os.path.abspath('../')) # Adjust the path as needed\n",
"\n",
"from my_util import df_to_corr_matrix\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.preprocessing import StandardScaler, RobustScaler\n",
"from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
"\n",
"\n",
"from joblib import Parallel, delayed\n",
"\n",
"from pickle import dump , load"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Read data\n",
"training_file = \"../TrainDataset2024.xls\"\n",
"\n",
"\n",
"data = pd.read_excel(training_file)\n",
"data.drop([\"ID\", \"pCR (outcome)\"], axis=1, inplace=True)\n",
"data_no_na = data.replace(999, np.nan)\n",
"data_no_na.dropna(ignore_index=True, inplace=True)\n",
"X = data_no_na.drop('RelapseFreeSurvival (outcome)', axis=1)\n",
"y = data_no_na['RelapseFreeSurvival (outcome)']"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Drop highly correlated features\n",
"CORR_THRESHOLD = 0.8\n",
"# Create a correlation matrix\n",
"correlation_matrix = X.corr()\n",
"\n",
"highly_correlated_features = set()\n",
"\n",
"for i in range(len(correlation_matrix.columns)):\n",
" for j in range(i):\n",
" if abs(correlation_matrix.iloc[i, j]) > CORR_THRESHOLD:\n",
" highly_correlated_features.add(correlation_matrix.columns[i])\n",
"\n",
"X_no_highly_correlated = X.drop(columns=highly_correlated_features)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler()\n",
"Xs = scaler.fit_transform(X_no_highly_correlated)\n",
"Xs = pd.DataFrame(Xs, columns=X_no_highly_correlated.columns)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>RelapseFreeSurvival (outcome)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>original_firstorder_InterquartileRange</th>\n",
" <td>0.179408</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_firstorder_Kurtosis</th>\n",
" <td>0.146722</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TumourStage</th>\n",
" <td>0.140490</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_MajorAxisLength</th>\n",
" <td>0.133172</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_firstorder_90Percentile</th>\n",
" <td>0.131621</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ChemoGrade</th>\n",
" <td>0.118109</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HER2</th>\n",
" <td>0.114470</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_Maximum2DDiameterRow</th>\n",
" <td>0.110054</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_LeastAxisLength</th>\n",
" <td>0.093966</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_Maximum2DDiameterColumn</th>\n",
" <td>0.093880</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glszm_SmallAreaEmphasis</th>\n",
" <td>0.085894</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Age</th>\n",
" <td>0.076094</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_Sphericity</th>\n",
" <td>0.072807</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_firstorder_10Percentile</th>\n",
" <td>0.071754</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glszm_SizeZoneNonUniformityNormalized</th>\n",
" <td>0.071593</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_gldm_DependenceEntropy</th>\n",
" <td>0.063785</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_ngtdm_Busyness</th>\n",
" <td>0.063398</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glcm_Imc1</th>\n",
" <td>0.056633</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Gene</th>\n",
" <td>0.052832</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_gldm_SmallDependenceEmphasis</th>\n",
" <td>0.051927</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glszm_GrayLevelNonUniformityNormalized</th>\n",
" <td>0.043586</td>\n",
" </tr>\n",
" <tr>\n",
" <th>PgR</th>\n",
" <td>0.042124</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TrippleNegative</th>\n",
" <td>0.038812</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_Elongation</th>\n",
" <td>0.036772</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glcm_Correlation</th>\n",
" <td>0.035816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_firstorder_Minimum</th>\n",
" <td>0.034335</td>\n",
" </tr>\n",
" <tr>\n",
" <th>HistologyType</th>\n",
" <td>0.032645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glrlm_ShortRunHighGrayLevelEmphasis</th>\n",
" <td>0.031545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_firstorder_Skewness</th>\n",
" <td>0.030369</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glrlm_LongRunLowGrayLevelEmphasis</th>\n",
" <td>0.016570</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ER</th>\n",
" <td>0.015475</td>\n",
" </tr>\n",
" <tr>\n",
" <th>LNStatus</th>\n",
" <td>0.008145</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_shape_SurfaceVolumeRatio</th>\n",
" <td>0.006105</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glszm_GrayLevelNonUniformity</th>\n",
" <td>0.005831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_glcm_Autocorrelation</th>\n",
" <td>0.005342</td>\n",
" </tr>\n",
" <tr>\n",
" <th>original_ngtdm_Strength</th>\n",
" <td>0.002199</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" RelapseFreeSurvival (outcome)\n",
"original_firstorder_InterquartileRange 0.179408\n",
"original_firstorder_Kurtosis 0.146722\n",
"TumourStage 0.140490\n",
"original_shape_MajorAxisLength 0.133172\n",
"original_firstorder_90Percentile 0.131621\n",
"ChemoGrade 0.118109\n",
"HER2 0.114470\n",
"original_shape_Maximum2DDiameterRow 0.110054\n",
"original_shape_LeastAxisLength 0.093966\n",
"original_shape_Maximum2DDiameterColumn 0.093880\n",
"original_glszm_SmallAreaEmphasis 0.085894\n",
"Age 0.076094\n",
"original_shape_Sphericity 0.072807\n",
"original_firstorder_10Percentile 0.071754\n",
"original_glszm_SizeZoneNonUniformityNormalized 0.071593\n",
"original_gldm_DependenceEntropy 0.063785\n",
"original_ngtdm_Busyness 0.063398\n",
"original_glcm_Imc1 0.056633\n",
"Gene 0.052832\n",
"original_gldm_SmallDependenceEmphasis 0.051927\n",
"original_glszm_GrayLevelNonUniformityNormalized 0.043586\n",
"PgR 0.042124\n",
"TrippleNegative 0.038812\n",
"original_shape_Elongation 0.036772\n",
"original_glcm_Correlation 0.035816\n",
"original_firstorder_Minimum 0.034335\n",
"HistologyType 0.032645\n",
"original_glrlm_ShortRunHighGrayLevelEmphasis 0.031545\n",
"original_firstorder_Skewness 0.030369\n",
"original_glrlm_LongRunLowGrayLevelEmphasis 0.016570\n",
"ER 0.015475\n",
"LNStatus 0.008145\n",
"original_shape_SurfaceVolumeRatio 0.006105\n",
"original_glszm_GrayLevelNonUniformity 0.005831\n",
"original_glcm_Autocorrelation 0.005342\n",
"original_ngtdm_Strength 0.002199"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.concat([y, Xs], axis=1)\n",
"\n",
"corr = df.corr()[[\"RelapseFreeSurvival (outcome)\"]]\n",
"\n",
"corr.drop([\"RelapseFreeSurvival (outcome)\"], inplace=True)\n",
"\n",
"corr[\"RelapseFreeSurvival (outcome)\"] = abs(corr[\"RelapseFreeSurvival (outcome)\"])\n",
"\n",
"sorted = corr.sort_values(by=\"RelapseFreeSurvival (outcome)\", ascending=False)\n",
"\n",
"sorted"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" RelapseFreeSurvival (outcome)\n",
"original_firstorder_InterquartileRange 0.179408\n",
"original_firstorder_Kurtosis 0.146722\n",
"TumourStage 0.140490\n",
"original_shape_MajorAxisLength 0.133172\n",
"original_firstorder_90Percentile 0.131621\n",
"ChemoGrade 0.118109\n",
"original_shape_Maximum2DDiameterRow 0.110054\n",
"original_shape_LeastAxisLength 0.093966\n",
"original_shape_Maximum2DDiameterColumn 0.093880\n",
"original_glszm_SmallAreaEmphasis 0.085894\n",
"Age 0.076094\n",
"original_shape_Sphericity 0.072807\n",
"original_firstorder_10Percentile 0.071754\n",
"original_glszm_SizeZoneNonUniformityNormalized 0.071593\n",
"original_gldm_DependenceEntropy 0.063785\n",
"original_ngtdm_Busyness 0.063398\n",
"original_glcm_Imc1 0.056633\n",
"original_gldm_SmallDependenceEmphasis 0.051927\n",
"original_glszm_GrayLevelNonUniformityNormalized 0.043586\n",
"PgR 0.042124\n",
"TrippleNegative 0.038812\n",
"original_shape_Elongation 0.036772\n",
"original_glcm_Correlation 0.035816\n",
"original_firstorder_Minimum 0.034335\n",
"HistologyType 0.032645\n",
"original_glrlm_ShortRunHighGrayLevelEmphasis 0.031545\n",
"original_firstorder_Skewness 0.030369\n",
"original_glrlm_LongRunLowGrayLevelEmphasis 0.016570\n",
"LNStatus 0.008145\n",
"original_shape_SurfaceVolumeRatio 0.006105\n",
"original_glszm_GrayLevelNonUniformity 0.005831\n",
"original_glcm_Autocorrelation 0.005342\n",
"original_ngtdm_Strength 0.002199\n"
]
}
],
"source": [
"print(sorted.drop(['ER', 'HER2', 'Gene']))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"36\n"
]
}
],
"source": [
"print(len(sorted))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ER', 'HER2', 'Gene', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis']\n",
"Saved pkl/regression_features_corr_5_selected_features.pkl\n",
"['ER', 'HER2', 'Gene', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'TumourStage', 'original_shape_MajorAxisLength', 'original_firstorder_90Percentile', 'ChemoGrade', 'HER2']\n",
"Saved pkl/regression_features_corr_10_selected_features.pkl\n",
"['ER', 'HER2', 'Gene', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'TumourStage', 'original_shape_MajorAxisLength', 'original_firstorder_90Percentile', 'ChemoGrade', 'HER2', 'original_shape_Maximum2DDiameterRow', 'original_shape_LeastAxisLength', 'original_shape_Maximum2DDiameterColumn', 'original_glszm_SmallAreaEmphasis', 'Age']\n",
"Saved pkl/regression_features_corr_15_selected_features.pkl\n",
"['ER', 'HER2', 'Gene', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'TumourStage', 'original_shape_MajorAxisLength', 'original_firstorder_90Percentile', 'ChemoGrade', 'HER2', 'original_shape_Maximum2DDiameterRow', 'original_shape_LeastAxisLength', 'original_shape_Maximum2DDiameterColumn', 'original_glszm_SmallAreaEmphasis', 'Age', 'original_shape_Sphericity', 'original_firstorder_10Percentile', 'original_glszm_SizeZoneNonUniformityNormalized', 'original_gldm_DependenceEntropy', 'original_ngtdm_Busyness']\n",
"Saved pkl/regression_features_corr_20_selected_features.pkl\n",
"['ER', 'HER2', 'Gene', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'TumourStage', 'original_shape_MajorAxisLength', 'original_firstorder_90Percentile', 'ChemoGrade', 'HER2', 'original_shape_Maximum2DDiameterRow', 'original_shape_LeastAxisLength', 'original_shape_Maximum2DDiameterColumn', 'original_glszm_SmallAreaEmphasis', 'Age', 'original_shape_Sphericity', 'original_firstorder_10Percentile', 'original_glszm_SizeZoneNonUniformityNormalized', 'original_gldm_DependenceEntropy', 'original_ngtdm_Busyness', 'original_glcm_Imc1', 'Gene', 'original_gldm_SmallDependenceEmphasis', 'original_glszm_GrayLevelNonUniformityNormalized', 'PgR']\n",
"Saved pkl/regression_features_corr_25_selected_features.pkl\n",
"['ER', 'HER2', 'Gene', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'TumourStage', 'original_shape_MajorAxisLength', 'original_firstorder_90Percentile', 'ChemoGrade', 'HER2', 'original_shape_Maximum2DDiameterRow', 'original_shape_LeastAxisLength', 'original_shape_Maximum2DDiameterColumn', 'original_glszm_SmallAreaEmphasis', 'Age', 'original_shape_Sphericity', 'original_firstorder_10Percentile', 'original_glszm_SizeZoneNonUniformityNormalized', 'original_gldm_DependenceEntropy', 'original_ngtdm_Busyness', 'original_glcm_Imc1', 'Gene', 'original_gldm_SmallDependenceEmphasis', 'original_glszm_GrayLevelNonUniformityNormalized', 'PgR', 'TrippleNegative', 'original_shape_Elongation', 'original_glcm_Correlation', 'original_firstorder_Minimum', 'HistologyType']\n",
"Saved pkl/regression_features_corr_30_selected_features.pkl\n",
"['ER', 'HER2', 'Gene', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'TumourStage', 'original_shape_MajorAxisLength', 'original_firstorder_90Percentile', 'ChemoGrade', 'HER2', 'original_shape_Maximum2DDiameterRow', 'original_shape_LeastAxisLength', 'original_shape_Maximum2DDiameterColumn', 'original_glszm_SmallAreaEmphasis', 'Age', 'original_shape_Sphericity', 'original_firstorder_10Percentile', 'original_glszm_SizeZoneNonUniformityNormalized', 'original_gldm_DependenceEntropy', 'original_ngtdm_Busyness', 'original_glcm_Imc1', 'Gene', 'original_gldm_SmallDependenceEmphasis', 'original_glszm_GrayLevelNonUniformityNormalized', 'PgR', 'TrippleNegative', 'original_shape_Elongation', 'original_glcm_Correlation', 'original_firstorder_Minimum', 'HistologyType', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_firstorder_Skewness', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'ER', 'LNStatus']\n",
"Saved pkl/regression_features_corr_35_selected_features.pkl\n"
]
}
],
"source": [
"num_of_features_list = [5, 10, 15, 20, 25, 30, 35]\n",
"\n",
"for n in num_of_features_list:\n",
" with open(f\"pkl/regression_features_corr_{n}_selected_features.pkl\", 'wb') as file:\n",
" print(['ER', 'HER2', 'Gene'] + list(sorted[:n-3].index))\n",
" dump(list(sorted[:n].index), file)\n",
" print(f\"Saved {file.name}\")\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "MLEAsm",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}