{ "cells": [ { "cell_type": "markdown", "id": "0e877a17", "metadata": {}, "source": [ "**TEST CODE REGRESSION**" ] }, { "cell_type": "code", "execution_count": 1, "id": "a5167c85", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.ensemble import RandomForestRegressor\n", "import numpy as np\n", "import warnings\n", "from sklearn.svm import SVR \n", "from sklearn.metrics import mean_absolute_error\n", "from sklearn.preprocessing import MinMaxScaler\n", "import pickle\n", "\n", "warnings.filterwarnings('ignore') #ignore warning messages" ] }, { "cell_type": "code", "execution_count": 2, "id": "8e81f387", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Enter the file path: TestTrialDataset.xls\n" ] } ], "source": [ "# Ask the user where the dataset lives; stray whitespace around the answer is removed\n", "file_path = input(\"Enter the file path: \").strip()\n", "\n", "# Choose the pandas reader from the extension (case-insensitive, so '.CSV' or '.XLSX' also work)\n", "extension_probe = file_path.lower()\n", "if extension_probe.endswith('.csv'):\n", "    df = pd.read_csv(file_path)\n", "elif extension_probe.endswith(('.xls', '.xlsx')):\n", "    df = pd.read_excel(file_path)\n", "else:\n", "    # Raise instead of print + exit(): an exception stops the run at this cell with a visible\n", "    # message, whereas exit() inside a notebook shuts the kernel down.\n", "    raise ValueError(\"Invalid file format. Please provide a CSV or XLS/XLSX file.\")" ] }, { "cell_type": "markdown", "id": "f712f600", "metadata": {}, "source": [ "**Data analysis and preprocessing**" ] }, { "cell_type": "code", "execution_count": 3, "id": "7d90d850", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDAgeERPgRHER2TrippleNegativeChemoGradeProliferationHistologyTypeLNStatus...original_glszm_SmallAreaHighGrayLevelEmphasisoriginal_glszm_SmallAreaLowGrayLevelEmphasisoriginal_glszm_ZoneEntropyoriginal_glszm_ZonePercentageoriginal_glszm_ZoneVarianceoriginal_ngtdm_Busynessoriginal_ngtdm_Coarsenessoriginal_ngtdm_Complexityoriginal_ngtdm_Contrastoriginal_ngtdm_Strength
0TRG00217839.011003311...0.4443910.4443913.0321440.0056122372009.74459.4597100.0043830.0320120.0010060.003685
1TRG00226046.011002121...0.3131700.3131703.7147520.0072461192964.863111.8934090.0024820.0799860.0056430.002184
2TRG00226847.000013310...0.4299040.4299043.3308740.0042906083001.390170.3579550.0015560.0452000.0018850.001360
3TRG00227166.000012121...0.5433630.5433632.3240370.0043383224591.77224.3287200.0104420.0131870.0001670.008646
4TRG00227243.000012111...0.2620340.2620343.2610440.0017564833932.159294.1989730.0011110.1436360.0204410.001033
\n", "

5 rows × 118 columns

\n", "
" ], "text/plain": [ " ID Age ER PgR HER2 TrippleNegative ChemoGrade Proliferation \\\n", "0 TRG002178 39.0 1 1 0 0 3 3 \n", "1 TRG002260 46.0 1 1 0 0 2 1 \n", "2 TRG002268 47.0 0 0 0 1 3 3 \n", "3 TRG002271 66.0 0 0 0 1 2 1 \n", "4 TRG002272 43.0 0 0 0 1 2 1 \n", "\n", " HistologyType LNStatus ... \\\n", "0 1 1 ... \n", "1 2 1 ... \n", "2 1 0 ... \n", "3 2 1 ... \n", "4 1 1 ... \n", "\n", " original_glszm_SmallAreaHighGrayLevelEmphasis \\\n", "0 0.444391 \n", "1 0.313170 \n", "2 0.429904 \n", "3 0.543363 \n", "4 0.262034 \n", "\n", " original_glszm_SmallAreaLowGrayLevelEmphasis original_glszm_ZoneEntropy \\\n", "0 0.444391 3.032144 \n", "1 0.313170 3.714752 \n", "2 0.429904 3.330874 \n", "3 0.543363 2.324037 \n", "4 0.262034 3.261044 \n", "\n", " original_glszm_ZonePercentage original_glszm_ZoneVariance \\\n", "0 0.005612 2372009.744 \n", "1 0.007246 1192964.863 \n", "2 0.004290 6083001.390 \n", "3 0.004338 3224591.772 \n", "4 0.001756 4833932.159 \n", "\n", " original_ngtdm_Busyness original_ngtdm_Coarseness \\\n", "0 59.459710 0.004383 \n", "1 111.893409 0.002482 \n", "2 170.357955 0.001556 \n", "3 24.328720 0.010442 \n", "4 294.198973 0.001111 \n", "\n", " original_ngtdm_Complexity original_ngtdm_Contrast original_ngtdm_Strength \n", "0 0.032012 0.001006 0.003685 \n", "1 0.079986 0.005643 0.002184 \n", "2 0.045200 0.001885 0.001360 \n", "3 0.013187 0.000167 0.008646 \n", "4 0.143636 0.020441 0.001033 \n", "\n", "[5 rows x 118 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Display the dataset read\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "570e8e10", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(100, 118)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 5, "id": "5a8cd295", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeERPgRHER2TrippleNegativeChemoGradeProliferationHistologyTypeLNStatusTumourStage...original_glszm_SmallAreaHighGrayLevelEmphasisoriginal_glszm_SmallAreaLowGrayLevelEmphasisoriginal_glszm_ZoneEntropyoriginal_glszm_ZonePercentageoriginal_glszm_ZoneVarianceoriginal_ngtdm_Busynessoriginal_ngtdm_Coarsenessoriginal_ngtdm_Complexityoriginal_ngtdm_Contrastoriginal_ngtdm_Strength
count100.000000100.00100.000000100.000000100.000000100.00100.000000100.000000100.000000100.000000...100.000000100.000000100.000000100.0000001.000000e+02100.000000100.000000100.000000100.000000100.000000
mean51.0414680.4510.29000010.35000010.3400002.351.5800001.1300000.4800002.500000...0.3953160.3949492.6354840.0030032.245330e+07329.04092410000.0409120.0565490.0061100.037659
std10.6323330.5099.87075499.86479599.8657910.500.8062880.3379980.5021170.881917...0.1718260.1713210.7482430.0023187.232660e+072071.40537699999.9958680.0518880.0092640.127991
min29.6700890.000.0000000.0000000.0000001.001.0000001.0000000.0000001.000000...0.0000210.0000080.5916730.0001253.750289e+040.0000000.0003930.0000000.0000000.000000
25%43.8028750.000.0000000.0000000.0000002.001.0000001.0000000.0000002.000000...0.3194810.3194812.1984180.0010741.169844e+0617.1897260.0018410.0118830.0001240.001585
50%49.0000000.000.0000000.0000000.0000002.001.0000001.0000000.0000002.000000...0.4061600.4061592.6544260.0026424.632876e+0659.3899600.0049920.0453850.0026100.004358
75%59.5687881.001.0000001.0000001.0000003.002.0000001.0000001.0000003.000000...0.5265610.5265613.1836330.0042731.039883e+07158.7321330.0155430.0858160.0085100.013397
max79.6030121.00999.000000999.000000999.0000003.003.0000002.0000001.0000004.000000...0.8571430.8571434.9474270.0104315.488188e+0820764.6937901000000.0000000.2851000.0607421.092132
\n", "

8 rows × 117 columns

\n", "
" ], "text/plain": [ " Age ER PgR HER2 TrippleNegative \\\n", "count 100.000000 100.00 100.000000 100.000000 100.000000 \n", "mean 51.041468 0.45 10.290000 10.350000 10.340000 \n", "std 10.632333 0.50 99.870754 99.864795 99.865791 \n", "min 29.670089 0.00 0.000000 0.000000 0.000000 \n", "25% 43.802875 0.00 0.000000 0.000000 0.000000 \n", "50% 49.000000 0.00 0.000000 0.000000 0.000000 \n", "75% 59.568788 1.00 1.000000 1.000000 1.000000 \n", "max 79.603012 1.00 999.000000 999.000000 999.000000 \n", "\n", " ChemoGrade Proliferation HistologyType LNStatus TumourStage ... \\\n", "count 100.00 100.000000 100.000000 100.000000 100.000000 ... \n", "mean 2.35 1.580000 1.130000 0.480000 2.500000 ... \n", "std 0.50 0.806288 0.337998 0.502117 0.881917 ... \n", "min 1.00 1.000000 1.000000 0.000000 1.000000 ... \n", "25% 2.00 1.000000 1.000000 0.000000 2.000000 ... \n", "50% 2.00 1.000000 1.000000 0.000000 2.000000 ... \n", "75% 3.00 2.000000 1.000000 1.000000 3.000000 ... \n", "max 3.00 3.000000 2.000000 1.000000 4.000000 ... 
\n", "\n", " original_glszm_SmallAreaHighGrayLevelEmphasis \\\n", "count 100.000000 \n", "mean 0.395316 \n", "std 0.171826 \n", "min 0.000021 \n", "25% 0.319481 \n", "50% 0.406160 \n", "75% 0.526561 \n", "max 0.857143 \n", "\n", " original_glszm_SmallAreaLowGrayLevelEmphasis \\\n", "count 100.000000 \n", "mean 0.394949 \n", "std 0.171321 \n", "min 0.000008 \n", "25% 0.319481 \n", "50% 0.406159 \n", "75% 0.526561 \n", "max 0.857143 \n", "\n", " original_glszm_ZoneEntropy original_glszm_ZonePercentage \\\n", "count 100.000000 100.000000 \n", "mean 2.635484 0.003003 \n", "std 0.748243 0.002318 \n", "min 0.591673 0.000125 \n", "25% 2.198418 0.001074 \n", "50% 2.654426 0.002642 \n", "75% 3.183633 0.004273 \n", "max 4.947427 0.010431 \n", "\n", " original_glszm_ZoneVariance original_ngtdm_Busyness \\\n", "count 1.000000e+02 100.000000 \n", "mean 2.245330e+07 329.040924 \n", "std 7.232660e+07 2071.405376 \n", "min 3.750289e+04 0.000000 \n", "25% 1.169844e+06 17.189726 \n", "50% 4.632876e+06 59.389960 \n", "75% 1.039883e+07 158.732133 \n", "max 5.488188e+08 20764.693790 \n", "\n", " original_ngtdm_Coarseness original_ngtdm_Complexity \\\n", "count 100.000000 100.000000 \n", "mean 10000.040912 0.056549 \n", "std 99999.995868 0.051888 \n", "min 0.000393 0.000000 \n", "25% 0.001841 0.011883 \n", "50% 0.004992 0.045385 \n", "75% 0.015543 0.085816 \n", "max 1000000.000000 0.285100 \n", "\n", " original_ngtdm_Contrast original_ngtdm_Strength \n", "count 100.000000 100.000000 \n", "mean 0.006110 0.037659 \n", "std 0.009264 0.127991 \n", "min 0.000000 0.000000 \n", "25% 0.000124 0.001585 \n", "50% 0.002610 0.004358 \n", "75% 0.008510 0.013397 \n", "max 0.060742 1.092132 \n", "\n", "[8 rows x 117 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 6, "id": "ef0cc14a", "metadata": {}, "outputs": [], "source": [ "# 999 shows up as the maximum of otherwise 0/1 columns in describe(), so it is treated as the\n", "# missing-value marker (assumption - confirm against the data dictionary).\n", "# Replace it with NaN rather than None: an explicit value=None is pad-filled by pandas < 1.4 and\n", "# turns the columns into object dtype on later versions, while NaN keeps them numeric everywhere.\n", "df = df.replace(999, np.nan)" ] }, { "cell_type": "code", 
"execution_count": 7, "id": "73ce52f3", "metadata": {}, "outputs": [], "source": [ "# Select the 12 columns at positions 1-12 so that missing values are handled only there.\n", "# Position 0 is the ID column (not needed); the remaining columns are values derived from the image scans.\n", "# .copy() makes Df_ an independent frame, so imputing it can never write through to df.\n", "Df_ = df.iloc[:, 1:13].copy()" ] }, { "cell_type": "code", "execution_count": 8, "id": "9c5ca240", "metadata": {}, "outputs": [], "source": [ "# Impute every missing entry with the most frequent value (mode) of its own column.\n", "# The filled frame is assigned back instead of calling Df_[col].fillna(..., inplace=True):\n", "# that form is chained assignment, which pandas with Copy-on-Write applies to a temporary\n", "# object, leaving the missing values in place without any error.\n", "# Columns with no observed value at all have no mode and are left as they are.\n", "column_modes = {col: Df_[col].mode().iloc[0] for col in Df_.columns if Df_[col].notna().any()}\n", "Df_ = Df_.fillna(value=column_modes)" ] }, { "cell_type": "code", "execution_count": 9, "id": "afa3de24", "metadata": {}, "outputs": [], "source": [ "# Re-attach the untouched columns from position 13 onwards to the imputed columns.\n", "Df_imputed = pd.concat((Df_, df.iloc[:, 13:]), axis=1)" ] }, { "cell_type": "code", "execution_count": 11, "id": "7f278e10", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AgeERPgRHER2TrippleNegativeChemoGradeProliferationHistologyTypeLNStatusTumourStage...original_glszm_SmallAreaHighGrayLevelEmphasisoriginal_glszm_SmallAreaLowGrayLevelEmphasisoriginal_glszm_ZoneEntropyoriginal_glszm_ZonePercentageoriginal_glszm_ZoneVarianceoriginal_ngtdm_Busynessoriginal_ngtdm_Coarsenessoriginal_ngtdm_Complexityoriginal_ngtdm_Contrastoriginal_ngtdm_Strength
039.00000110033112...0.4443910.4443913.0321440.0056122.372010e+0659.4597100.0043830.0320121.005722e-030.003685
146.00000110021212...0.3131700.3131703.7147520.0072461.192965e+06111.8934090.0024820.0799865.642792e-030.002184
247.00000000133102...0.4299040.4299043.3308740.0042906.083001e+06170.3579550.0015560.0452001.884625e-030.001360
366.00000000121214...0.5433630.5433632.3240370.0043383.224592e+0624.3287200.0104420.0131871.670920e-040.008646
443.00000000121112...0.2620340.2620343.2610440.0017564.833932e+06294.1989730.0011110.1436362.044052e-020.001033
..................................................................
9551.00000000121111...0.5277780.5277781.5000000.0014501.419345e+060.9936820.2522760.0028745.640000e-060.232840
9662.12731000132114...0.3287240.3286683.6158180.0002125.488188e+0818.6498910.0134210.0011739.660000e-070.013201
9748.50000111033102...0.3976010.3976013.4011230.0030576.264910e+06205.2206520.0013490.0666634.626845e-030.001174
9838.50000001033112...0.4109700.4109702.7925860.0055094.708709e+0557.2900260.0052010.1115021.218696e-020.004557
9966.30000001021102...0.0833830.0833771.5849630.0055663.750289e+0413.9700650.0238750.1554152.406440e-020.021448
\n", "

100 rows × 117 columns

\n", "
" ], "text/plain": [ " Age ER PgR HER2 TrippleNegative ChemoGrade Proliferation \\\n", "0 39.00000 1 1 0 0 3 3 \n", "1 46.00000 1 1 0 0 2 1 \n", "2 47.00000 0 0 0 1 3 3 \n", "3 66.00000 0 0 0 1 2 1 \n", "4 43.00000 0 0 0 1 2 1 \n", ".. ... .. ... ... ... ... ... \n", "95 51.00000 0 0 0 1 2 1 \n", "96 62.12731 0 0 0 1 3 2 \n", "97 48.50000 1 1 1 0 3 3 \n", "98 38.50000 0 0 1 0 3 3 \n", "99 66.30000 0 0 1 0 2 1 \n", "\n", " HistologyType LNStatus TumourStage ... \\\n", "0 1 1 2 ... \n", "1 2 1 2 ... \n", "2 1 0 2 ... \n", "3 2 1 4 ... \n", "4 1 1 2 ... \n", ".. ... ... ... ... \n", "95 1 1 1 ... \n", "96 1 1 4 ... \n", "97 1 0 2 ... \n", "98 1 1 2 ... \n", "99 1 0 2 ... \n", "\n", " original_glszm_SmallAreaHighGrayLevelEmphasis \\\n", "0 0.444391 \n", "1 0.313170 \n", "2 0.429904 \n", "3 0.543363 \n", "4 0.262034 \n", ".. ... \n", "95 0.527778 \n", "96 0.328724 \n", "97 0.397601 \n", "98 0.410970 \n", "99 0.083383 \n", "\n", " original_glszm_SmallAreaLowGrayLevelEmphasis original_glszm_ZoneEntropy \\\n", "0 0.444391 3.032144 \n", "1 0.313170 3.714752 \n", "2 0.429904 3.330874 \n", "3 0.543363 2.324037 \n", "4 0.262034 3.261044 \n", ".. ... ... \n", "95 0.527778 1.500000 \n", "96 0.328668 3.615818 \n", "97 0.397601 3.401123 \n", "98 0.410970 2.792586 \n", "99 0.083377 1.584963 \n", "\n", " original_glszm_ZonePercentage original_glszm_ZoneVariance \\\n", "0 0.005612 2.372010e+06 \n", "1 0.007246 1.192965e+06 \n", "2 0.004290 6.083001e+06 \n", "3 0.004338 3.224592e+06 \n", "4 0.001756 4.833932e+06 \n", ".. ... ... \n", "95 0.001450 1.419345e+06 \n", "96 0.000212 5.488188e+08 \n", "97 0.003057 6.264910e+06 \n", "98 0.005509 4.708709e+05 \n", "99 0.005566 3.750289e+04 \n", "\n", " original_ngtdm_Busyness original_ngtdm_Coarseness \\\n", "0 59.459710 0.004383 \n", "1 111.893409 0.002482 \n", "2 170.357955 0.001556 \n", "3 24.328720 0.010442 \n", "4 294.198973 0.001111 \n", ".. ... ... 
\n", "95 0.993682 0.252276 \n", "96 18.649891 0.013421 \n", "97 205.220652 0.001349 \n", "98 57.290026 0.005201 \n", "99 13.970065 0.023875 \n", "\n", " original_ngtdm_Complexity original_ngtdm_Contrast \\\n", "0 0.032012 1.005722e-03 \n", "1 0.079986 5.642792e-03 \n", "2 0.045200 1.884625e-03 \n", "3 0.013187 1.670920e-04 \n", "4 0.143636 2.044052e-02 \n", ".. ... ... \n", "95 0.002874 5.640000e-06 \n", "96 0.001173 9.660000e-07 \n", "97 0.066663 4.626845e-03 \n", "98 0.111502 1.218696e-02 \n", "99 0.155415 2.406440e-02 \n", "\n", " original_ngtdm_Strength \n", "0 0.003685 \n", "1 0.002184 \n", "2 0.001360 \n", "3 0.008646 \n", "4 0.001033 \n", ".. ... \n", "95 0.232840 \n", "96 0.013201 \n", "97 0.001174 \n", "98 0.004557 \n", "99 0.021448 \n", "\n", "[100 rows x 117 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Df_imputed" ] }, { "cell_type": "markdown", "id": "aecaaadb", "metadata": {}, "source": [ "**Feature selection after using Random Forest for feature selection**" ] }, { "cell_type": "code", "execution_count": 14, "id": "917be203", "metadata": {}, "outputs": [], "source": [ "selected_column_names = [\"Age\", \"original_firstorder_90Percentile\",\n", " \"original_firstorder_Kurtosis\", \"original_firstorder_Range\",\n", " \"original_firstorder_Variance\",\n", " \"original_glrlm_ShortRunHighGrayLevelEmphasis\",\n", " \"original_glszm_SizeZoneNonUniformity\"]" ] }, { "cell_type": "code", "execution_count": 15, "id": "bead9d3f", "metadata": {}, "outputs": [], "source": [ "feature_selected = Df_imputed[selected_column_names]" ] }, { "cell_type": "code", "execution_count": 17, "id": "ccf3c195", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(100, 7)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "feature_selected.shape" ] }, { "cell_type": "code", "execution_count": 18, "id": "bdbf898c", "metadata": {}, "outputs": [], "source": [ "# Create a scaler 
object\n", "# NOTE(review): this fits a new MinMaxScaler on the test file itself. If the pickled model was trained on features scaled by a scaler fitted on the training data, that fitted scaler should be loaded and only transform() applied here - confirm.\n", "scaler = MinMaxScaler()\n", "\n", "# Scale the data\n", "data_scaled = scaler.fit_transform(feature_selected)" ] }, { "cell_type": "code", "execution_count": 19, "id": "067cc5cc", "metadata": {}, "outputs": [], "source": [ "# Load the trained model; the with-block closes the file handle as soon as unpickling is done.\n", "# NOTE: pickle can execute arbitrary code while loading - only open model files from a trusted source.\n", "with open('svr_gridsearch_optimised_using_rf_for_fs.pkl', 'rb') as model_file:\n", "    model = pickle.load(model_file)" ] }, { "cell_type": "code", "execution_count": 20, "id": "fe678b12", "metadata": {}, "outputs": [], "source": [ "# Predict the outcome for every row of the scaled feature matrix\n", "y_pred = model.predict(data_scaled)" ] }, { "cell_type": "code", "execution_count": 21, "id": "f053c6e7", "metadata": {}, "outputs": [], "source": [ "# Wrap the predictions in a single-column frame named after the outcome\n", "test_predictions = pd.DataFrame(y_pred, columns=['RelapseFreeSurvival (outcome)'])" ] }, { "cell_type": "code", "execution_count": 22, "id": "67306c7c", "metadata": {}, "outputs": [], "source": [ "# Start the export frame from the ID column of the file that was read\n", "to_excel_df = pd.DataFrame(df['ID'])" ] }, { "cell_type": "code", "execution_count": 23, "id": "a71d3498", "metadata": {}, "outputs": [], "source": [ "# Put each ID next to its prediction and save the result as an Excel file\n", "to_excel_df = pd.concat((to_excel_df, test_predictions), axis=1)\n", "to_excel_df.to_excel('FinalTestRFS.xlsx', index=False, header=True)" ] }, { "cell_type": "markdown", "id": "13f5ea38", "metadata": {}, "source": [ "**END**" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }