1303 lines (1302 with data), 47.0 kB
{
"cells": [
{
"cell_type": "markdown",
"id": "0e877a17",
"metadata": {},
"source": [
"**TEST CODE REGRESSION**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a5167c85",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"import numpy as np\n",
"import warnings\n",
"from sklearn.svm import SVR \n",
"from sklearn.metrics import mean_absolute_error\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"import pickle\n",
"\n",
"warnings.filterwarnings('ignore') #ignore warning messages"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8e81f387",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Enter the file path: TestTrialDataset.xls\n"
]
}
],
"source": [
"# Ask the user for the dataset location (surrounding whitespace is ignored)\n",
"file_path = input(\"Enter the file path: \").strip()\n",
"\n",
"# Choose the reader from the file extension; the comparison is case-insensitive\n",
"# so that names such as 'DATA.XLSX' are accepted as well.\n",
"extension = file_path.lower()\n",
"if extension.endswith('.csv'):\n",
"    df = pd.read_csv(file_path)\n",
"elif extension.endswith(('.xls', '.xlsx')):\n",
"    df = pd.read_excel(file_path)\n",
"else:\n",
"    # Raise instead of calling exit(): the exception stops this cell with a clear\n",
"    # message, so later cells are not run against an undefined `df`.\n",
"    raise ValueError(\"Invalid file format. Please provide a CSV or XLS/XLSX file.\")"
]
},
{
"cell_type": "markdown",
"id": "f712f600",
"metadata": {},
"source": [
"**Data analysis and preprocessing**"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7d90d850",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Age</th>\n",
" <th>ER</th>\n",
" <th>PgR</th>\n",
" <th>HER2</th>\n",
" <th>TrippleNegative</th>\n",
" <th>ChemoGrade</th>\n",
" <th>Proliferation</th>\n",
" <th>HistologyType</th>\n",
" <th>LNStatus</th>\n",
" <th>...</th>\n",
" <th>original_glszm_SmallAreaHighGrayLevelEmphasis</th>\n",
" <th>original_glszm_SmallAreaLowGrayLevelEmphasis</th>\n",
" <th>original_glszm_ZoneEntropy</th>\n",
" <th>original_glszm_ZonePercentage</th>\n",
" <th>original_glszm_ZoneVariance</th>\n",
" <th>original_ngtdm_Busyness</th>\n",
" <th>original_ngtdm_Coarseness</th>\n",
" <th>original_ngtdm_Complexity</th>\n",
" <th>original_ngtdm_Contrast</th>\n",
" <th>original_ngtdm_Strength</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TRG002178</td>\n",
" <td>39.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0.444391</td>\n",
" <td>0.444391</td>\n",
" <td>3.032144</td>\n",
" <td>0.005612</td>\n",
" <td>2372009.744</td>\n",
" <td>59.459710</td>\n",
" <td>0.004383</td>\n",
" <td>0.032012</td>\n",
" <td>0.001006</td>\n",
" <td>0.003685</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TRG002260</td>\n",
" <td>46.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0.313170</td>\n",
" <td>0.313170</td>\n",
" <td>3.714752</td>\n",
" <td>0.007246</td>\n",
" <td>1192964.863</td>\n",
" <td>111.893409</td>\n",
" <td>0.002482</td>\n",
" <td>0.079986</td>\n",
" <td>0.005643</td>\n",
" <td>0.002184</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>TRG002268</td>\n",
" <td>47.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.429904</td>\n",
" <td>0.429904</td>\n",
" <td>3.330874</td>\n",
" <td>0.004290</td>\n",
" <td>6083001.390</td>\n",
" <td>170.357955</td>\n",
" <td>0.001556</td>\n",
" <td>0.045200</td>\n",
" <td>0.001885</td>\n",
" <td>0.001360</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TRG002271</td>\n",
" <td>66.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0.543363</td>\n",
" <td>0.543363</td>\n",
" <td>2.324037</td>\n",
" <td>0.004338</td>\n",
" <td>3224591.772</td>\n",
" <td>24.328720</td>\n",
" <td>0.010442</td>\n",
" <td>0.013187</td>\n",
" <td>0.000167</td>\n",
" <td>0.008646</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>TRG002272</td>\n",
" <td>43.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0.262034</td>\n",
" <td>0.262034</td>\n",
" <td>3.261044</td>\n",
" <td>0.001756</td>\n",
" <td>4833932.159</td>\n",
" <td>294.198973</td>\n",
" <td>0.001111</td>\n",
" <td>0.143636</td>\n",
" <td>0.020441</td>\n",
" <td>0.001033</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 118 columns</p>\n",
"</div>"
],
"text/plain": [
" ID Age ER PgR HER2 TrippleNegative ChemoGrade Proliferation \\\n",
"0 TRG002178 39.0 1 1 0 0 3 3 \n",
"1 TRG002260 46.0 1 1 0 0 2 1 \n",
"2 TRG002268 47.0 0 0 0 1 3 3 \n",
"3 TRG002271 66.0 0 0 0 1 2 1 \n",
"4 TRG002272 43.0 0 0 0 1 2 1 \n",
"\n",
" HistologyType LNStatus ... \\\n",
"0 1 1 ... \n",
"1 2 1 ... \n",
"2 1 0 ... \n",
"3 2 1 ... \n",
"4 1 1 ... \n",
"\n",
" original_glszm_SmallAreaHighGrayLevelEmphasis \\\n",
"0 0.444391 \n",
"1 0.313170 \n",
"2 0.429904 \n",
"3 0.543363 \n",
"4 0.262034 \n",
"\n",
" original_glszm_SmallAreaLowGrayLevelEmphasis original_glszm_ZoneEntropy \\\n",
"0 0.444391 3.032144 \n",
"1 0.313170 3.714752 \n",
"2 0.429904 3.330874 \n",
"3 0.543363 2.324037 \n",
"4 0.262034 3.261044 \n",
"\n",
" original_glszm_ZonePercentage original_glszm_ZoneVariance \\\n",
"0 0.005612 2372009.744 \n",
"1 0.007246 1192964.863 \n",
"2 0.004290 6083001.390 \n",
"3 0.004338 3224591.772 \n",
"4 0.001756 4833932.159 \n",
"\n",
" original_ngtdm_Busyness original_ngtdm_Coarseness \\\n",
"0 59.459710 0.004383 \n",
"1 111.893409 0.002482 \n",
"2 170.357955 0.001556 \n",
"3 24.328720 0.010442 \n",
"4 294.198973 0.001111 \n",
"\n",
" original_ngtdm_Complexity original_ngtdm_Contrast original_ngtdm_Strength \n",
"0 0.032012 0.001006 0.003685 \n",
"1 0.079986 0.005643 0.002184 \n",
"2 0.045200 0.001885 0.001360 \n",
"3 0.013187 0.000167 0.008646 \n",
"4 0.143636 0.020441 0.001033 \n",
"\n",
"[5 rows x 118 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the dataset read\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "570e8e10",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(100, 118)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5a8cd295",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>ER</th>\n",
" <th>PgR</th>\n",
" <th>HER2</th>\n",
" <th>TrippleNegative</th>\n",
" <th>ChemoGrade</th>\n",
" <th>Proliferation</th>\n",
" <th>HistologyType</th>\n",
" <th>LNStatus</th>\n",
" <th>TumourStage</th>\n",
" <th>...</th>\n",
" <th>original_glszm_SmallAreaHighGrayLevelEmphasis</th>\n",
" <th>original_glszm_SmallAreaLowGrayLevelEmphasis</th>\n",
" <th>original_glszm_ZoneEntropy</th>\n",
" <th>original_glszm_ZonePercentage</th>\n",
" <th>original_glszm_ZoneVariance</th>\n",
" <th>original_ngtdm_Busyness</th>\n",
" <th>original_ngtdm_Coarseness</th>\n",
" <th>original_ngtdm_Complexity</th>\n",
" <th>original_ngtdm_Contrast</th>\n",
" <th>original_ngtdm_Strength</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>100.000000</td>\n",
" <td>100.00</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>100.00</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>...</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>1.000000e+02</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>51.041468</td>\n",
" <td>0.45</td>\n",
" <td>10.290000</td>\n",
" <td>10.350000</td>\n",
" <td>10.340000</td>\n",
" <td>2.35</td>\n",
" <td>1.580000</td>\n",
" <td>1.130000</td>\n",
" <td>0.480000</td>\n",
" <td>2.500000</td>\n",
" <td>...</td>\n",
" <td>0.395316</td>\n",
" <td>0.394949</td>\n",
" <td>2.635484</td>\n",
" <td>0.003003</td>\n",
" <td>2.245330e+07</td>\n",
" <td>329.040924</td>\n",
" <td>10000.040912</td>\n",
" <td>0.056549</td>\n",
" <td>0.006110</td>\n",
" <td>0.037659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>10.632333</td>\n",
" <td>0.50</td>\n",
" <td>99.870754</td>\n",
" <td>99.864795</td>\n",
" <td>99.865791</td>\n",
" <td>0.50</td>\n",
" <td>0.806288</td>\n",
" <td>0.337998</td>\n",
" <td>0.502117</td>\n",
" <td>0.881917</td>\n",
" <td>...</td>\n",
" <td>0.171826</td>\n",
" <td>0.171321</td>\n",
" <td>0.748243</td>\n",
" <td>0.002318</td>\n",
" <td>7.232660e+07</td>\n",
" <td>2071.405376</td>\n",
" <td>99999.995868</td>\n",
" <td>0.051888</td>\n",
" <td>0.009264</td>\n",
" <td>0.127991</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>29.670089</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.00</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>...</td>\n",
" <td>0.000021</td>\n",
" <td>0.000008</td>\n",
" <td>0.591673</td>\n",
" <td>0.000125</td>\n",
" <td>3.750289e+04</td>\n",
" <td>0.000000</td>\n",
" <td>0.000393</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>43.802875</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>2.00</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>2.000000</td>\n",
" <td>...</td>\n",
" <td>0.319481</td>\n",
" <td>0.319481</td>\n",
" <td>2.198418</td>\n",
" <td>0.001074</td>\n",
" <td>1.169844e+06</td>\n",
" <td>17.189726</td>\n",
" <td>0.001841</td>\n",
" <td>0.011883</td>\n",
" <td>0.000124</td>\n",
" <td>0.001585</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>49.000000</td>\n",
" <td>0.00</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>2.00</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>2.000000</td>\n",
" <td>...</td>\n",
" <td>0.406160</td>\n",
" <td>0.406159</td>\n",
" <td>2.654426</td>\n",
" <td>0.002642</td>\n",
" <td>4.632876e+06</td>\n",
" <td>59.389960</td>\n",
" <td>0.004992</td>\n",
" <td>0.045385</td>\n",
" <td>0.002610</td>\n",
" <td>0.004358</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>59.568788</td>\n",
" <td>1.00</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>3.00</td>\n",
" <td>2.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" <td>...</td>\n",
" <td>0.526561</td>\n",
" <td>0.526561</td>\n",
" <td>3.183633</td>\n",
" <td>0.004273</td>\n",
" <td>1.039883e+07</td>\n",
" <td>158.732133</td>\n",
" <td>0.015543</td>\n",
" <td>0.085816</td>\n",
" <td>0.008510</td>\n",
" <td>0.013397</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>79.603012</td>\n",
" <td>1.00</td>\n",
" <td>999.000000</td>\n",
" <td>999.000000</td>\n",
" <td>999.000000</td>\n",
" <td>3.00</td>\n",
" <td>3.000000</td>\n",
" <td>2.000000</td>\n",
" <td>1.000000</td>\n",
" <td>4.000000</td>\n",
" <td>...</td>\n",
" <td>0.857143</td>\n",
" <td>0.857143</td>\n",
" <td>4.947427</td>\n",
" <td>0.010431</td>\n",
" <td>5.488188e+08</td>\n",
" <td>20764.693790</td>\n",
" <td>1000000.000000</td>\n",
" <td>0.285100</td>\n",
" <td>0.060742</td>\n",
" <td>1.092132</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 117 columns</p>\n",
"</div>"
],
"text/plain": [
" Age ER PgR HER2 TrippleNegative \\\n",
"count 100.000000 100.00 100.000000 100.000000 100.000000 \n",
"mean 51.041468 0.45 10.290000 10.350000 10.340000 \n",
"std 10.632333 0.50 99.870754 99.864795 99.865791 \n",
"min 29.670089 0.00 0.000000 0.000000 0.000000 \n",
"25% 43.802875 0.00 0.000000 0.000000 0.000000 \n",
"50% 49.000000 0.00 0.000000 0.000000 0.000000 \n",
"75% 59.568788 1.00 1.000000 1.000000 1.000000 \n",
"max 79.603012 1.00 999.000000 999.000000 999.000000 \n",
"\n",
" ChemoGrade Proliferation HistologyType LNStatus TumourStage ... \\\n",
"count 100.00 100.000000 100.000000 100.000000 100.000000 ... \n",
"mean 2.35 1.580000 1.130000 0.480000 2.500000 ... \n",
"std 0.50 0.806288 0.337998 0.502117 0.881917 ... \n",
"min 1.00 1.000000 1.000000 0.000000 1.000000 ... \n",
"25% 2.00 1.000000 1.000000 0.000000 2.000000 ... \n",
"50% 2.00 1.000000 1.000000 0.000000 2.000000 ... \n",
"75% 3.00 2.000000 1.000000 1.000000 3.000000 ... \n",
"max 3.00 3.000000 2.000000 1.000000 4.000000 ... \n",
"\n",
" original_glszm_SmallAreaHighGrayLevelEmphasis \\\n",
"count 100.000000 \n",
"mean 0.395316 \n",
"std 0.171826 \n",
"min 0.000021 \n",
"25% 0.319481 \n",
"50% 0.406160 \n",
"75% 0.526561 \n",
"max 0.857143 \n",
"\n",
" original_glszm_SmallAreaLowGrayLevelEmphasis \\\n",
"count 100.000000 \n",
"mean 0.394949 \n",
"std 0.171321 \n",
"min 0.000008 \n",
"25% 0.319481 \n",
"50% 0.406159 \n",
"75% 0.526561 \n",
"max 0.857143 \n",
"\n",
" original_glszm_ZoneEntropy original_glszm_ZonePercentage \\\n",
"count 100.000000 100.000000 \n",
"mean 2.635484 0.003003 \n",
"std 0.748243 0.002318 \n",
"min 0.591673 0.000125 \n",
"25% 2.198418 0.001074 \n",
"50% 2.654426 0.002642 \n",
"75% 3.183633 0.004273 \n",
"max 4.947427 0.010431 \n",
"\n",
" original_glszm_ZoneVariance original_ngtdm_Busyness \\\n",
"count 1.000000e+02 100.000000 \n",
"mean 2.245330e+07 329.040924 \n",
"std 7.232660e+07 2071.405376 \n",
"min 3.750289e+04 0.000000 \n",
"25% 1.169844e+06 17.189726 \n",
"50% 4.632876e+06 59.389960 \n",
"75% 1.039883e+07 158.732133 \n",
"max 5.488188e+08 20764.693790 \n",
"\n",
" original_ngtdm_Coarseness original_ngtdm_Complexity \\\n",
"count 100.000000 100.000000 \n",
"mean 10000.040912 0.056549 \n",
"std 99999.995868 0.051888 \n",
"min 0.000393 0.000000 \n",
"25% 0.001841 0.011883 \n",
"50% 0.004992 0.045385 \n",
"75% 0.015543 0.085816 \n",
"max 1000000.000000 0.285100 \n",
"\n",
" original_ngtdm_Contrast original_ngtdm_Strength \n",
"count 100.000000 100.000000 \n",
"mean 0.006110 0.037659 \n",
"std 0.009264 0.127991 \n",
"min 0.000000 0.000000 \n",
"25% 0.000124 0.001585 \n",
"50% 0.002610 0.004358 \n",
"75% 0.008510 0.013397 \n",
"max 0.060742 1.092132 \n",
"\n",
"[8 rows x 117 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ef0cc14a",
"metadata": {},
"outputs": [],
"source": [
"# Recode the value 999 as NaN so that pandas treats those entries as missing.\n",
"# np.nan (rather than None) keeps the numeric dtypes and avoids the\n",
"# version-dependent meaning of `replace(..., None)`.\n",
"df = df.replace(999, np.nan)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "73ce52f3",
"metadata": {},
"outputs": [],
"source": [
"# Take columns 1-12 (the twelve columns right after ID) so their missing values\n",
"# can be handled separately; the remaining columns are values derived from the image scans.\n",
"# The ID column (position 0) is not needed here.\n",
"# .copy() makes Df_ an independent frame, so filling it never writes through to `df`.\n",
"\n",
"Df_ = df.iloc[:, 1:13].copy()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "9c5ca240",
"metadata": {},
"outputs": [],
"source": [
"# Fill each column's missing entries with that column's most frequent value.\n",
"# DataFrame.mode() lists the modes of every column row by row, so .iloc[0] holds\n",
"# the first mode of each column.\n",
"# The result is assigned back instead of calling `Df_[col].fillna(..., inplace=True)`:\n",
"# that form works on a temporary Series and does not update Df_ when pandas\n",
"# Copy-on-Write is enabled.\n",
"Df_ = Df_.fillna(Df_.mode().iloc[0])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "afa3de24",
"metadata": {},
"outputs": [],
"source": [
"# Place the imputed columns (Df_) side by side with the columns from position 13 onward;\n",
"# the ID column is left out of the combined frame.\n",
"Df_imputed = pd.concat([Df_, df.iloc[:, 13:]], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7f278e10",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>ER</th>\n",
" <th>PgR</th>\n",
" <th>HER2</th>\n",
" <th>TrippleNegative</th>\n",
" <th>ChemoGrade</th>\n",
" <th>Proliferation</th>\n",
" <th>HistologyType</th>\n",
" <th>LNStatus</th>\n",
" <th>TumourStage</th>\n",
" <th>...</th>\n",
" <th>original_glszm_SmallAreaHighGrayLevelEmphasis</th>\n",
" <th>original_glszm_SmallAreaLowGrayLevelEmphasis</th>\n",
" <th>original_glszm_ZoneEntropy</th>\n",
" <th>original_glszm_ZonePercentage</th>\n",
" <th>original_glszm_ZoneVariance</th>\n",
" <th>original_ngtdm_Busyness</th>\n",
" <th>original_ngtdm_Coarseness</th>\n",
" <th>original_ngtdm_Complexity</th>\n",
" <th>original_ngtdm_Contrast</th>\n",
" <th>original_ngtdm_Strength</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>39.00000</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0.444391</td>\n",
" <td>0.444391</td>\n",
" <td>3.032144</td>\n",
" <td>0.005612</td>\n",
" <td>2.372010e+06</td>\n",
" <td>59.459710</td>\n",
" <td>0.004383</td>\n",
" <td>0.032012</td>\n",
" <td>1.005722e-03</td>\n",
" <td>0.003685</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>46.00000</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0.313170</td>\n",
" <td>0.313170</td>\n",
" <td>3.714752</td>\n",
" <td>0.007246</td>\n",
" <td>1.192965e+06</td>\n",
" <td>111.893409</td>\n",
" <td>0.002482</td>\n",
" <td>0.079986</td>\n",
" <td>5.642792e-03</td>\n",
" <td>0.002184</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>47.00000</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0.429904</td>\n",
" <td>0.429904</td>\n",
" <td>3.330874</td>\n",
" <td>0.004290</td>\n",
" <td>6.083001e+06</td>\n",
" <td>170.357955</td>\n",
" <td>0.001556</td>\n",
" <td>0.045200</td>\n",
" <td>1.884625e-03</td>\n",
" <td>0.001360</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>66.00000</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>0.543363</td>\n",
" <td>0.543363</td>\n",
" <td>2.324037</td>\n",
" <td>0.004338</td>\n",
" <td>3.224592e+06</td>\n",
" <td>24.328720</td>\n",
" <td>0.010442</td>\n",
" <td>0.013187</td>\n",
" <td>1.670920e-04</td>\n",
" <td>0.008646</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>43.00000</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0.262034</td>\n",
" <td>0.262034</td>\n",
" <td>3.261044</td>\n",
" <td>0.001756</td>\n",
" <td>4.833932e+06</td>\n",
" <td>294.198973</td>\n",
" <td>0.001111</td>\n",
" <td>0.143636</td>\n",
" <td>2.044052e-02</td>\n",
" <td>0.001033</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>51.00000</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0.527778</td>\n",
" <td>0.527778</td>\n",
" <td>1.500000</td>\n",
" <td>0.001450</td>\n",
" <td>1.419345e+06</td>\n",
" <td>0.993682</td>\n",
" <td>0.252276</td>\n",
" <td>0.002874</td>\n",
" <td>5.640000e-06</td>\n",
" <td>0.232840</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>62.12731</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>0.328724</td>\n",
" <td>0.328668</td>\n",
" <td>3.615818</td>\n",
" <td>0.000212</td>\n",
" <td>5.488188e+08</td>\n",
" <td>18.649891</td>\n",
" <td>0.013421</td>\n",
" <td>0.001173</td>\n",
" <td>9.660000e-07</td>\n",
" <td>0.013201</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>48.50000</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0.397601</td>\n",
" <td>0.397601</td>\n",
" <td>3.401123</td>\n",
" <td>0.003057</td>\n",
" <td>6.264910e+06</td>\n",
" <td>205.220652</td>\n",
" <td>0.001349</td>\n",
" <td>0.066663</td>\n",
" <td>4.626845e-03</td>\n",
" <td>0.001174</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>38.50000</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0.410970</td>\n",
" <td>0.410970</td>\n",
" <td>2.792586</td>\n",
" <td>0.005509</td>\n",
" <td>4.708709e+05</td>\n",
" <td>57.290026</td>\n",
" <td>0.005201</td>\n",
" <td>0.111502</td>\n",
" <td>1.218696e-02</td>\n",
" <td>0.004557</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>66.30000</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0.083383</td>\n",
" <td>0.083377</td>\n",
" <td>1.584963</td>\n",
" <td>0.005566</td>\n",
" <td>3.750289e+04</td>\n",
" <td>13.970065</td>\n",
" <td>0.023875</td>\n",
" <td>0.155415</td>\n",
" <td>2.406440e-02</td>\n",
" <td>0.021448</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 117 columns</p>\n",
"</div>"
],
"text/plain": [
" Age ER PgR HER2 TrippleNegative ChemoGrade Proliferation \\\n",
"0 39.00000 1 1 0 0 3 3 \n",
"1 46.00000 1 1 0 0 2 1 \n",
"2 47.00000 0 0 0 1 3 3 \n",
"3 66.00000 0 0 0 1 2 1 \n",
"4 43.00000 0 0 0 1 2 1 \n",
".. ... .. ... ... ... ... ... \n",
"95 51.00000 0 0 0 1 2 1 \n",
"96 62.12731 0 0 0 1 3 2 \n",
"97 48.50000 1 1 1 0 3 3 \n",
"98 38.50000 0 0 1 0 3 3 \n",
"99 66.30000 0 0 1 0 2 1 \n",
"\n",
" HistologyType LNStatus TumourStage ... \\\n",
"0 1 1 2 ... \n",
"1 2 1 2 ... \n",
"2 1 0 2 ... \n",
"3 2 1 4 ... \n",
"4 1 1 2 ... \n",
".. ... ... ... ... \n",
"95 1 1 1 ... \n",
"96 1 1 4 ... \n",
"97 1 0 2 ... \n",
"98 1 1 2 ... \n",
"99 1 0 2 ... \n",
"\n",
" original_glszm_SmallAreaHighGrayLevelEmphasis \\\n",
"0 0.444391 \n",
"1 0.313170 \n",
"2 0.429904 \n",
"3 0.543363 \n",
"4 0.262034 \n",
".. ... \n",
"95 0.527778 \n",
"96 0.328724 \n",
"97 0.397601 \n",
"98 0.410970 \n",
"99 0.083383 \n",
"\n",
" original_glszm_SmallAreaLowGrayLevelEmphasis original_glszm_ZoneEntropy \\\n",
"0 0.444391 3.032144 \n",
"1 0.313170 3.714752 \n",
"2 0.429904 3.330874 \n",
"3 0.543363 2.324037 \n",
"4 0.262034 3.261044 \n",
".. ... ... \n",
"95 0.527778 1.500000 \n",
"96 0.328668 3.615818 \n",
"97 0.397601 3.401123 \n",
"98 0.410970 2.792586 \n",
"99 0.083377 1.584963 \n",
"\n",
" original_glszm_ZonePercentage original_glszm_ZoneVariance \\\n",
"0 0.005612 2.372010e+06 \n",
"1 0.007246 1.192965e+06 \n",
"2 0.004290 6.083001e+06 \n",
"3 0.004338 3.224592e+06 \n",
"4 0.001756 4.833932e+06 \n",
".. ... ... \n",
"95 0.001450 1.419345e+06 \n",
"96 0.000212 5.488188e+08 \n",
"97 0.003057 6.264910e+06 \n",
"98 0.005509 4.708709e+05 \n",
"99 0.005566 3.750289e+04 \n",
"\n",
" original_ngtdm_Busyness original_ngtdm_Coarseness \\\n",
"0 59.459710 0.004383 \n",
"1 111.893409 0.002482 \n",
"2 170.357955 0.001556 \n",
"3 24.328720 0.010442 \n",
"4 294.198973 0.001111 \n",
".. ... ... \n",
"95 0.993682 0.252276 \n",
"96 18.649891 0.013421 \n",
"97 205.220652 0.001349 \n",
"98 57.290026 0.005201 \n",
"99 13.970065 0.023875 \n",
"\n",
" original_ngtdm_Complexity original_ngtdm_Contrast \\\n",
"0 0.032012 1.005722e-03 \n",
"1 0.079986 5.642792e-03 \n",
"2 0.045200 1.884625e-03 \n",
"3 0.013187 1.670920e-04 \n",
"4 0.143636 2.044052e-02 \n",
".. ... ... \n",
"95 0.002874 5.640000e-06 \n",
"96 0.001173 9.660000e-07 \n",
"97 0.066663 4.626845e-03 \n",
"98 0.111502 1.218696e-02 \n",
"99 0.155415 2.406440e-02 \n",
"\n",
" original_ngtdm_Strength \n",
"0 0.003685 \n",
"1 0.002184 \n",
"2 0.001360 \n",
"3 0.008646 \n",
"4 0.001033 \n",
".. ... \n",
"95 0.232840 \n",
"96 0.013201 \n",
"97 0.001174 \n",
"98 0.004557 \n",
"99 0.021448 \n",
"\n",
"[100 rows x 117 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Df_imputed"
]
},
{
"cell_type": "markdown",
"id": "aecaaadb",
"metadata": {},
"source": [
"**Feature selection: keep the features previously chosen with Random Forest**"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "917be203",
"metadata": {},
"outputs": [],
"source": [
"# Names of the feature columns that are passed on to the model\n",
"selected_column_names = [\n",
"    \"Age\",\n",
"    \"original_firstorder_90Percentile\",\n",
"    \"original_firstorder_Kurtosis\",\n",
"    \"original_firstorder_Range\",\n",
"    \"original_firstorder_Variance\",\n",
"    \"original_glrlm_ShortRunHighGrayLevelEmphasis\",\n",
"    \"original_glszm_SizeZoneNonUniformity\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "bead9d3f",
"metadata": {},
"outputs": [],
"source": [
"feature_selected = Df_imputed[selected_column_names]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ccf3c195",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(100, 7)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_selected.shape"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "bdbf898c",
"metadata": {},
"outputs": [],
"source": [
"# Min-max scale the selected features: fit the scaler, then transform the same data.\n",
"# NOTE(review): the scaler is fitted on this test file itself. If the pickled model was\n",
"# trained on features scaled by a scaler fitted on the training data, that fitted scaler\n",
"# should be loaded and only `transform` applied here - confirm against the training notebook.\n",
"scaler = MinMaxScaler()\n",
"scaler.fit(feature_selected)\n",
"data_scaled = scaler.transform(feature_selected)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "067cc5cc",
"metadata": {},
"outputs": [],
"source": [
"# Load the saved model; the context manager closes the file handle once unpickling finishes.\n",
"# NOTE: pickle can execute arbitrary code while loading - only open model files from a trusted source.\n",
"with open('svr_gridsearch_optimised_using_rf_for_fs.pkl', 'rb') as model_file:\n",
"    model = pickle.load(model_file)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "fe678b12",
"metadata": {},
"outputs": [],
"source": [
"#model prediction\n",
"y_pred = model.predict(data_scaled)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "f053c6e7",
"metadata": {},
"outputs": [],
"source": [
"test_predictions = pd.DataFrame(y_pred, columns=['RelapseFreeSurvival (outcome)'])"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "67306c7c",
"metadata": {},
"outputs": [],
"source": [
"to_excel_df = pd.DataFrame(df['ID'])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "a71d3498",
"metadata": {},
"outputs": [],
"source": [
"# Pair each ID with its prediction and export the result to Excel.\n",
"# The frame is rebuilt from df['ID'] on every run, so re-running this cell does not\n",
"# append a duplicate prediction column to to_excel_df.\n",
"to_excel_df = pd.concat([df['ID'], test_predictions], axis=1)\n",
"to_excel_df.to_excel('FinalTestRFS.xlsx', index=False, header=True)"
]
},
{
"cell_type": "markdown",
"id": "13f5ea38",
"metadata": {},
"source": [
"**END**"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}