{ "cells": [ { "cell_type": "markdown", "id": "0e877a17", "metadata": {}, "source": [ "**TEST CODE REGRESSION**" ] }, { "cell_type": "code", "execution_count": 1, "id": "a5167c85", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.ensemble import RandomForestRegressor\n", "import numpy as np\n", "import warnings\n", "from sklearn.svm import SVR \n", "from sklearn.metrics import mean_absolute_error\n", "from sklearn.preprocessing import MinMaxScaler\n", "import pickle\n", "\n", "warnings.filterwarnings('ignore') #ignore warning messages" ] }, { "cell_type": "code", "execution_count": 2, "id": "8e81f387", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Enter the file path: TestTrialDataset.xls\n" ] } ], "source": [ "# Get the file path from the user\n", "file_path = input(\"Enter the file path: \")\n", "\n", "# Check the file extension on a lower-cased copy so names such as 'DATA.CSV' are accepted;\n", "# the original path is still what gets passed to pandas\n", "path_lower = file_path.lower()\n", "if path_lower.endswith('.csv'):\n", "    # Read CSV file\n", "    df = pd.read_csv(file_path)\n", "elif path_lower.endswith(('.xls', '.xlsx')):\n", "    # Read XLS/XLSX file\n", "    df = pd.read_excel(file_path)\n", "else:\n", "    # Raise instead of calling exit(): in Jupyter, exit() shuts the kernel down,\n", "    # whereas an exception halts execution here with a clear message and keeps the kernel alive\n", "    raise ValueError(\"Invalid file format. Please provide a CSV or XLS/XLSX file.\")" ] }, { "cell_type": "markdown", "id": "f712f600", "metadata": {}, "source": [ "**Data analysis and preprocessing**" ] }, { "cell_type": "code", "execution_count": 3, "id": "7d90d850", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | ID | \n", "Age | \n", "ER | \n", "PgR | \n", "HER2 | \n", "TrippleNegative | \n", "ChemoGrade | \n", "Proliferation | \n", "HistologyType | \n", "LNStatus | \n", "... | \n", "original_glszm_SmallAreaHighGrayLevelEmphasis | \n", "original_glszm_SmallAreaLowGrayLevelEmphasis | \n", "original_glszm_ZoneEntropy | \n", "original_glszm_ZonePercentage | \n", "original_glszm_ZoneVariance | \n", "original_ngtdm_Busyness | \n", "original_ngtdm_Coarseness | \n", "original_ngtdm_Complexity | \n", "original_ngtdm_Contrast | \n", "original_ngtdm_Strength | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "TRG002178 | \n", "39.0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "3 | \n", "3 | \n", "1 | \n", "1 | \n", "... | \n", "0.444391 | \n", "0.444391 | \n", "3.032144 | \n", "0.005612 | \n", "2372009.744 | \n", "59.459710 | \n", "0.004383 | \n", "0.032012 | \n", "0.001006 | \n", "0.003685 | \n", "
1 | \n", "TRG002260 | \n", "46.0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "2 | \n", "1 | \n", "2 | \n", "1 | \n", "... | \n", "0.313170 | \n", "0.313170 | \n", "3.714752 | \n", "0.007246 | \n", "1192964.863 | \n", "111.893409 | \n", "0.002482 | \n", "0.079986 | \n", "0.005643 | \n", "0.002184 | \n", "
2 | \n", "TRG002268 | \n", "47.0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "3 | \n", "3 | \n", "1 | \n", "0 | \n", "... | \n", "0.429904 | \n", "0.429904 | \n", "3.330874 | \n", "0.004290 | \n", "6083001.390 | \n", "170.357955 | \n", "0.001556 | \n", "0.045200 | \n", "0.001885 | \n", "0.001360 | \n", "
3 | \n", "TRG002271 | \n", "66.0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "2 | \n", "1 | \n", "2 | \n", "1 | \n", "... | \n", "0.543363 | \n", "0.543363 | \n", "2.324037 | \n", "0.004338 | \n", "3224591.772 | \n", "24.328720 | \n", "0.010442 | \n", "0.013187 | \n", "0.000167 | \n", "0.008646 | \n", "
4 | \n", "TRG002272 | \n", "43.0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "2 | \n", "1 | \n", "1 | \n", "1 | \n", "... | \n", "0.262034 | \n", "0.262034 | \n", "3.261044 | \n", "0.001756 | \n", "4833932.159 | \n", "294.198973 | \n", "0.001111 | \n", "0.143636 | \n", "0.020441 | \n", "0.001033 | \n", "
5 rows × 118 columns
\n", "\n", " | Age | \n", "ER | \n", "PgR | \n", "HER2 | \n", "TrippleNegative | \n", "ChemoGrade | \n", "Proliferation | \n", "HistologyType | \n", "LNStatus | \n", "TumourStage | \n", "... | \n", "original_glszm_SmallAreaHighGrayLevelEmphasis | \n", "original_glszm_SmallAreaLowGrayLevelEmphasis | \n", "original_glszm_ZoneEntropy | \n", "original_glszm_ZonePercentage | \n", "original_glszm_ZoneVariance | \n", "original_ngtdm_Busyness | \n", "original_ngtdm_Coarseness | \n", "original_ngtdm_Complexity | \n", "original_ngtdm_Contrast | \n", "original_ngtdm_Strength | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | \n", "100.000000 | \n", "100.00 | \n", "100.000000 | \n", "100.000000 | \n", "100.000000 | \n", "100.00 | \n", "100.000000 | \n", "100.000000 | \n", "100.000000 | \n", "100.000000 | \n", "... | \n", "100.000000 | \n", "100.000000 | \n", "100.000000 | \n", "100.000000 | \n", "1.000000e+02 | \n", "100.000000 | \n", "100.000000 | \n", "100.000000 | \n", "100.000000 | \n", "100.000000 | \n", "
mean | \n", "51.041468 | \n", "0.45 | \n", "10.290000 | \n", "10.350000 | \n", "10.340000 | \n", "2.35 | \n", "1.580000 | \n", "1.130000 | \n", "0.480000 | \n", "2.500000 | \n", "... | \n", "0.395316 | \n", "0.394949 | \n", "2.635484 | \n", "0.003003 | \n", "2.245330e+07 | \n", "329.040924 | \n", "10000.040912 | \n", "0.056549 | \n", "0.006110 | \n", "0.037659 | \n", "
std | \n", "10.632333 | \n", "0.50 | \n", "99.870754 | \n", "99.864795 | \n", "99.865791 | \n", "0.50 | \n", "0.806288 | \n", "0.337998 | \n", "0.502117 | \n", "0.881917 | \n", "... | \n", "0.171826 | \n", "0.171321 | \n", "0.748243 | \n", "0.002318 | \n", "7.232660e+07 | \n", "2071.405376 | \n", "99999.995868 | \n", "0.051888 | \n", "0.009264 | \n", "0.127991 | \n", "
min | \n", "29.670089 | \n", "0.00 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "1.00 | \n", "1.000000 | \n", "1.000000 | \n", "0.000000 | \n", "1.000000 | \n", "... | \n", "0.000021 | \n", "0.000008 | \n", "0.591673 | \n", "0.000125 | \n", "3.750289e+04 | \n", "0.000000 | \n", "0.000393 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
25% | \n", "43.802875 | \n", "0.00 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "2.00 | \n", "1.000000 | \n", "1.000000 | \n", "0.000000 | \n", "2.000000 | \n", "... | \n", "0.319481 | \n", "0.319481 | \n", "2.198418 | \n", "0.001074 | \n", "1.169844e+06 | \n", "17.189726 | \n", "0.001841 | \n", "0.011883 | \n", "0.000124 | \n", "0.001585 | \n", "
50% | \n", "49.000000 | \n", "0.00 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "2.00 | \n", "1.000000 | \n", "1.000000 | \n", "0.000000 | \n", "2.000000 | \n", "... | \n", "0.406160 | \n", "0.406159 | \n", "2.654426 | \n", "0.002642 | \n", "4.632876e+06 | \n", "59.389960 | \n", "0.004992 | \n", "0.045385 | \n", "0.002610 | \n", "0.004358 | \n", "
75% | \n", "59.568788 | \n", "1.00 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "3.00 | \n", "2.000000 | \n", "1.000000 | \n", "1.000000 | \n", "3.000000 | \n", "... | \n", "0.526561 | \n", "0.526561 | \n", "3.183633 | \n", "0.004273 | \n", "1.039883e+07 | \n", "158.732133 | \n", "0.015543 | \n", "0.085816 | \n", "0.008510 | \n", "0.013397 | \n", "
max | \n", "79.603012 | \n", "1.00 | \n", "999.000000 | \n", "999.000000 | \n", "999.000000 | \n", "3.00 | \n", "3.000000 | \n", "2.000000 | \n", "1.000000 | \n", "4.000000 | \n", "... | \n", "0.857143 | \n", "0.857143 | \n", "4.947427 | \n", "0.010431 | \n", "5.488188e+08 | \n", "20764.693790 | \n", "1000000.000000 | \n", "0.285100 | \n", "0.060742 | \n", "1.092132 | \n", "
8 rows × 117 columns
\n", "\n", " | Age | \n", "ER | \n", "PgR | \n", "HER2 | \n", "TrippleNegative | \n", "ChemoGrade | \n", "Proliferation | \n", "HistologyType | \n", "LNStatus | \n", "TumourStage | \n", "... | \n", "original_glszm_SmallAreaHighGrayLevelEmphasis | \n", "original_glszm_SmallAreaLowGrayLevelEmphasis | \n", "original_glszm_ZoneEntropy | \n", "original_glszm_ZonePercentage | \n", "original_glszm_ZoneVariance | \n", "original_ngtdm_Busyness | \n", "original_ngtdm_Coarseness | \n", "original_ngtdm_Complexity | \n", "original_ngtdm_Contrast | \n", "original_ngtdm_Strength | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "39.00000 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "3 | \n", "3 | \n", "1 | \n", "1 | \n", "2 | \n", "... | \n", "0.444391 | \n", "0.444391 | \n", "3.032144 | \n", "0.005612 | \n", "2.372010e+06 | \n", "59.459710 | \n", "0.004383 | \n", "0.032012 | \n", "1.005722e-03 | \n", "0.003685 | \n", "
1 | \n", "46.00000 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "2 | \n", "1 | \n", "2 | \n", "1 | \n", "2 | \n", "... | \n", "0.313170 | \n", "0.313170 | \n", "3.714752 | \n", "0.007246 | \n", "1.192965e+06 | \n", "111.893409 | \n", "0.002482 | \n", "0.079986 | \n", "5.642792e-03 | \n", "0.002184 | \n", "
2 | \n", "47.00000 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "3 | \n", "3 | \n", "1 | \n", "0 | \n", "2 | \n", "... | \n", "0.429904 | \n", "0.429904 | \n", "3.330874 | \n", "0.004290 | \n", "6.083001e+06 | \n", "170.357955 | \n", "0.001556 | \n", "0.045200 | \n", "1.884625e-03 | \n", "0.001360 | \n", "
3 | \n", "66.00000 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "2 | \n", "1 | \n", "2 | \n", "1 | \n", "4 | \n", "... | \n", "0.543363 | \n", "0.543363 | \n", "2.324037 | \n", "0.004338 | \n", "3.224592e+06 | \n", "24.328720 | \n", "0.010442 | \n", "0.013187 | \n", "1.670920e-04 | \n", "0.008646 | \n", "
4 | \n", "43.00000 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "2 | \n", "1 | \n", "1 | \n", "1 | \n", "2 | \n", "... | \n", "0.262034 | \n", "0.262034 | \n", "3.261044 | \n", "0.001756 | \n", "4.833932e+06 | \n", "294.198973 | \n", "0.001111 | \n", "0.143636 | \n", "2.044052e-02 | \n", "0.001033 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
95 | \n", "51.00000 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "2 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "... | \n", "0.527778 | \n", "0.527778 | \n", "1.500000 | \n", "0.001450 | \n", "1.419345e+06 | \n", "0.993682 | \n", "0.252276 | \n", "0.002874 | \n", "5.640000e-06 | \n", "0.232840 | \n", "
96 | \n", "62.12731 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "3 | \n", "2 | \n", "1 | \n", "1 | \n", "4 | \n", "... | \n", "0.328724 | \n", "0.328668 | \n", "3.615818 | \n", "0.000212 | \n", "5.488188e+08 | \n", "18.649891 | \n", "0.013421 | \n", "0.001173 | \n", "9.660000e-07 | \n", "0.013201 | \n", "
97 | \n", "48.50000 | \n", "1 | \n", "1 | \n", "1 | \n", "0 | \n", "3 | \n", "3 | \n", "1 | \n", "0 | \n", "2 | \n", "... | \n", "0.397601 | \n", "0.397601 | \n", "3.401123 | \n", "0.003057 | \n", "6.264910e+06 | \n", "205.220652 | \n", "0.001349 | \n", "0.066663 | \n", "4.626845e-03 | \n", "0.001174 | \n", "
98 | \n", "38.50000 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "3 | \n", "3 | \n", "1 | \n", "1 | \n", "2 | \n", "... | \n", "0.410970 | \n", "0.410970 | \n", "2.792586 | \n", "0.005509 | \n", "4.708709e+05 | \n", "57.290026 | \n", "0.005201 | \n", "0.111502 | \n", "1.218696e-02 | \n", "0.004557 | \n", "
99 | \n", "66.30000 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "2 | \n", "1 | \n", "1 | \n", "0 | \n", "2 | \n", "... | \n", "0.083383 | \n", "0.083377 | \n", "1.584963 | \n", "0.005566 | \n", "3.750289e+04 | \n", "13.970065 | \n", "0.023875 | \n", "0.155415 | \n", "2.406440e-02 | \n", "0.021448 | \n", "
100 rows × 117 columns
\n", "