{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "e15ff842", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import warnings\n", "import plotly.offline as py\n", "# py.init_notebook_mode(connected=True)\n", "import plotly.graph_objs as go\n", "import plotly.tools as tls\n", "import plotly.figure_factory as ff\n", "import seaborn as sns\n", "# warnings.filterwarnings('ignore') #ignore warning messages" ] }, { "cell_type": "code", "execution_count": 2, "id": "cf08041d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDpCR (outcome)RelapseFreeSurvival (outcome)AgeERPgRHER2TrippleNegativeChemoGradeProliferation...original_glszm_SmallAreaHighGrayLevelEmphasisoriginal_glszm_SmallAreaLowGrayLevelEmphasisoriginal_glszm_ZoneEntropyoriginal_glszm_ZonePercentageoriginal_glszm_ZoneVarianceoriginal_ngtdm_Busynessoriginal_ngtdm_Coarsenessoriginal_ngtdm_Complexityoriginal_ngtdm_Contrastoriginal_ngtdm_Strength
395TRG002948054.50000058.5101032...0.4764930.4764932.4535830.0032292.327038e+0618.5623770.0137660.0180420.0002880.012257
396TRG002954049.25000034.3000133...0.4183820.4183822.9956030.0042431.005061e+06156.6271790.0022280.1360150.0221480.002098
397TRG002958048.50000053.3000121...0.5277790.5277781.5000000.0037282.132007e+050.9967460.2525820.0073800.0000370.231059
398TRG002961047.50000068.8100033...0.3136930.3136933.5735570.0011122.008034e+07204.8642000.0013720.0540630.0036970.001368
399TRG002962046.91666746.0100021...0.6702290.6702291.8570450.0067065.609262e+059.6091630.0265910.0186820.0003110.022676
\n", "

5 rows × 120 columns

\n", "
" ], "text/plain": [ " ID pCR (outcome) RelapseFreeSurvival (outcome) Age ER PgR \\\n", "395 TRG002948 0 54.500000 58.5 1 0 \n", "396 TRG002954 0 49.250000 34.3 0 0 \n", "397 TRG002958 0 48.500000 53.3 0 0 \n", "398 TRG002961 0 47.500000 68.8 1 0 \n", "399 TRG002962 0 46.916667 46.0 1 0 \n", "\n", " HER2 TrippleNegative ChemoGrade Proliferation ... \\\n", "395 1 0 3 2 ... \n", "396 0 1 3 3 ... \n", "397 0 1 2 1 ... \n", "398 0 0 3 3 ... \n", "399 0 0 2 1 ... \n", "\n", " original_glszm_SmallAreaHighGrayLevelEmphasis \\\n", "395 0.476493 \n", "396 0.418382 \n", "397 0.527779 \n", "398 0.313693 \n", "399 0.670229 \n", "\n", " original_glszm_SmallAreaLowGrayLevelEmphasis original_glszm_ZoneEntropy \\\n", "395 0.476493 2.453583 \n", "396 0.418382 2.995603 \n", "397 0.527778 1.500000 \n", "398 0.313693 3.573557 \n", "399 0.670229 1.857045 \n", "\n", " original_glszm_ZonePercentage original_glszm_ZoneVariance \\\n", "395 0.003229 2.327038e+06 \n", "396 0.004243 1.005061e+06 \n", "397 0.003728 2.132007e+05 \n", "398 0.001112 2.008034e+07 \n", "399 0.006706 5.609262e+05 \n", "\n", " original_ngtdm_Busyness original_ngtdm_Coarseness \\\n", "395 18.562377 0.013766 \n", "396 156.627179 0.002228 \n", "397 0.996746 0.252582 \n", "398 204.864200 0.001372 \n", "399 9.609163 0.026591 \n", "\n", " original_ngtdm_Complexity original_ngtdm_Contrast \\\n", "395 0.018042 0.000288 \n", "396 0.136015 0.022148 \n", "397 0.007380 0.000037 \n", "398 0.054063 0.003697 \n", "399 0.018682 0.000311 \n", "\n", " original_ngtdm_Strength \n", "395 0.012257 \n", "396 0.002098 \n", "397 0.231059 \n", "398 0.001368 \n", "399 0.022676 \n", "\n", "[5 rows x 120 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_excel(\"TrainDataset2023.xls\")\n", "df.tail()" ] }, { "cell_type": "code", "execution_count": 3, "id": "6c971a55", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(400, 120)" ] }, "execution_count": 3, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 4, "id": "ba26513a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pCR (outcome)RelapseFreeSurvival (outcome)
count400.000000400.000000
mean12.69750056.000208
std111.10741727.137584
min0.0000000.000000
25%0.00000038.000000
50%0.00000055.000000
75%0.00000073.000000
max999.000000144.000000
\n", "
" ], "text/plain": [ " pCR (outcome) RelapseFreeSurvival (outcome)\n", "count 400.000000 400.000000\n", "mean 12.697500 56.000208\n", "std 111.107417 27.137584\n", "min 0.000000 0.000000\n", "25% 0.000000 38.000000\n", "50% 0.000000 55.000000\n", "75% 0.000000 73.000000\n", "max 999.000000 144.000000" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[:,0:3].describe()" ] }, { "cell_type": "code", "execution_count": 5, "id": "00865ac9", "metadata": {}, "outputs": [], "source": [ "# 999 marks a missing entry in this dataset; turn it into NaN so it can be imputed later.\n", "# np.nan is used instead of None: replace(999, None) is version-dependent (older pandas\n", "# forward-fills instead of inserting nulls) and None turns numeric columns into object dtype.\n", "# NOTE(review): this touches every column, so a genuine 999 in a feature column is blanked too.\n", "df = df.replace(999, np.nan)" ] }, { "cell_type": "code", "execution_count": 6, "id": "c6497858", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['ID', 'pCR (outcome)', 'RelapseFreeSurvival (outcome)', 'Age', 'ER',\n", " 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade', 'Proliferation',\n", " ...\n", " 'original_glszm_SmallAreaHighGrayLevelEmphasis',\n", " 'original_glszm_SmallAreaLowGrayLevelEmphasis',\n", " 'original_glszm_ZoneEntropy', 'original_glszm_ZonePercentage',\n", " 'original_glszm_ZoneVariance', 'original_ngtdm_Busyness',\n", " 'original_ngtdm_Coarseness', 'original_ngtdm_Complexity',\n", " 'original_ngtdm_Contrast', 'original_ngtdm_Strength'],\n", " dtype='object', length=120)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "code", "execution_count": 7, "id": "ed8600fe", "metadata": {}, "outputs": [], "source": [ "# Keep columns 1-12 (the two outcome columns plus Age ... TumourStage); the ID column is not needed.\n", "# .copy() makes Df_ an independent frame, so imputing it later never acts on a view of df.\n", "Df_ = df.iloc[:, 1:13].copy()" ] }, { "cell_type": "code", "execution_count": 8, "id": "093b41d1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pCR (outcome)RelapseFreeSurvival (outcome)AgeERPgRHER2TrippleNegativeChemoGradeProliferationHistologyTypeLNStatusTumourStage
01144.00000041.0000133112
10142.00000039.0110033112
21135.00000031.0000121102
3012.00000035.0000133113
40109.00000061.0100021102
.......................................
395054.50000058.5101032114
396049.25000034.3000133102
397048.50000053.3000121102
398047.50000068.8100033113
399046.91666746.0100021112
\n", "

400 rows × 12 columns

\n", "
" ], "text/plain": [ " pCR (outcome) RelapseFreeSurvival (outcome) Age ER PgR HER2 \\\n", "0 1 144.000000 41.0 0 0 0 \n", "1 0 142.000000 39.0 1 1 0 \n", "2 1 135.000000 31.0 0 0 0 \n", "3 0 12.000000 35.0 0 0 0 \n", "4 0 109.000000 61.0 1 0 0 \n", ".. ... ... ... .. .. ... \n", "395 0 54.500000 58.5 1 0 1 \n", "396 0 49.250000 34.3 0 0 0 \n", "397 0 48.500000 53.3 0 0 0 \n", "398 0 47.500000 68.8 1 0 0 \n", "399 0 46.916667 46.0 1 0 0 \n", "\n", " TrippleNegative ChemoGrade Proliferation HistologyType LNStatus \\\n", "0 1 3 3 1 1 \n", "1 0 3 3 1 1 \n", "2 1 2 1 1 0 \n", "3 1 3 3 1 1 \n", "4 0 2 1 1 0 \n", ".. ... ... ... ... ... \n", "395 0 3 2 1 1 \n", "396 1 3 3 1 0 \n", "397 1 2 1 1 0 \n", "398 0 3 3 1 1 \n", "399 0 2 1 1 1 \n", "\n", " TumourStage \n", "0 2 \n", "1 2 \n", "2 2 \n", "3 3 \n", "4 2 \n", ".. ... \n", "395 4 \n", "396 2 \n", "397 2 \n", "398 3 \n", "399 2 \n", "\n", "[400 rows x 12 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Df_" ] }, { "cell_type": "code", "execution_count": 9, "id": "b21e4eab", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Count\n", "pCR (outcome) 395\n", "RelapseFreeSurvival (outcome) 400\n", "Age 400\n", "ER 400\n", "PgR 399\n", "HER2 399\n", "TrippleNegative 399\n", "ChemoGrade 397\n", "Proliferation 398\n", "HistologyType 397\n", "LNStatus 399\n", "TumourStage 400\n" ] }, { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "marker": { "color": "lightgrey", "line": { "color": "#000000", "width": 1.5 } }, "opacity": 0.8, "type": "bar", "x": [ "pCR (outcome)", "RelapseFreeSurvival (outcome)", "Age", "ER", "PgR", "HER2", "TrippleNegative", "ChemoGrade", "Proliferation", "HistologyType", "LNStatus", "TumourStage" ], "y": [ 395, 400, 
400, 400, 399, 399, 399, 397, 398, 397, 399, 400 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" 
], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "fillpattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { 
"marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 
0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { 
"standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Missing Values in first 12 columns" } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Count the non-missing entries in each of the first 12 columns and plot them;\n", "# a bar shorter than len(Df_) marks a column that has missing values.\n", "\n", "null_feat = pd.DataFrame(len(Df_['pCR (outcome)']) - Df_.isnull().sum(), columns = ['Count'])\n", "print(null_feat)\n", "trace = go.Bar(x = null_feat.index, y = null_feat['Count'], opacity = 0.8,\n", "    marker=dict(color = 'lightgrey', line=dict(color='#000000',width=1.5)))\n", "\n", "# The bars are non-null counts, so the title names them as such.\n", "layout = dict(title = \"Non-missing values in first 12 columns\")\n", "\n", "fig = dict(data = [trace], layout=layout)\n", "py.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 10, "id": "2e3df9c1", "metadata": {}, "outputs": [], "source": [ "# Data imputation using mode\n", "#\n", "# Each column's first mode is collected, then applied with a single DataFrame.fillna call\n", "# whose result is assigned back to Df_. Filling Df_[col] with inplace=True would act on a\n", "# temporary Series: deprecated chained assignment that leaves Df_ unchanged under Copy-on-Write.\n", "# NOTE(review): this also imputes the two outcome columns and the continuous Age column\n", "# with their mode - confirm that is intended.\n", "\n", "columns = Df_.columns\n", "mode_by_column = {col: Df_[col].mode()[0] for col in columns}\n", "Df_ = Df_.fillna(mode_by_column)" ] }, { "cell_type": "code", "execution_count": 11, "id": "3d274755", "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "marker": { "color": "lightgrey", "line": { "color": "#000000", "width": 1.5 } }, "opacity": 0.8, "type": "bar", "x": [ "pCR (outcome)", "RelapseFreeSurvival (outcome)", "Age", "ER", "PgR", "HER2", "TrippleNegative", "ChemoGrade", "Proliferation", "HistologyType", "LNStatus", "TumourStage" ], "y": [ 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white",
"minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" 
], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "fillpattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, 
"ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": 
"left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "After treating missing values" } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Re-plot the per-column non-null counts of the first 12 columns once imputation is done;\n", "# every bar should now reach the full row count.\n", "\n", "null_feat = Df_.notna().sum().to_frame('Count')\n", "\n", "trace = go.Bar(\n", "    x=null_feat.index,\n", "    y=null_feat['Count'],\n", "    opacity=0.8,\n", "    marker={'color': 'lightgrey', 'line': {'color': '#000000', 'width': 1.5}},\n", ")\n", "\n", "layout = {'title': \"After treating missing values\"}\n", "\n", "fig = {'data': [trace], 'layout': layout}\n", "py.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 12, "id": "7a361367", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pCR (outcome)RelapseFreeSurvival (outcome)AgeERPgRHER2TrippleNegativeChemoGradeProliferationHistologyTypeLNStatusTumourStage
count400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000
mean0.21000056.00020851.8046740.5475000.4050000.3000000.3325002.3975001.5725001.1475000.5350002.607500
std0.40781827.13758410.9485220.4983620.4915070.4588310.4716990.5001190.7656430.3550480.4993980.897473
min0.0000000.00000023.0000000.0000000.0000000.0000000.0000001.0000001.0000001.0000000.0000001.000000
25%0.00000038.00000044.5167690.0000000.0000000.0000000.0000002.0000001.0000001.0000000.0000002.000000
50%0.00000055.00000051.0195071.0000000.0000000.0000000.0000002.0000001.0000001.0000001.0000002.000000
75%0.00000073.00000060.0000001.0000001.0000001.0000001.0000003.0000002.0000001.0000001.0000003.000000
max1.000000144.00000079.6030121.0000001.0000001.0000001.0000003.0000003.0000002.0000001.0000004.000000
\n", "
" ], "text/plain": [ " pCR (outcome) RelapseFreeSurvival (outcome) Age ER \\\n", "count 400.000000 400.000000 400.000000 400.000000 \n", "mean 0.210000 56.000208 51.804674 0.547500 \n", "std 0.407818 27.137584 10.948522 0.498362 \n", "min 0.000000 0.000000 23.000000 0.000000 \n", "25% 0.000000 38.000000 44.516769 0.000000 \n", "50% 0.000000 55.000000 51.019507 1.000000 \n", "75% 0.000000 73.000000 60.000000 1.000000 \n", "max 1.000000 144.000000 79.603012 1.000000 \n", "\n", " PgR HER2 TrippleNegative ChemoGrade Proliferation \\\n", "count 400.000000 400.000000 400.000000 400.000000 400.000000 \n", "mean 0.405000 0.300000 0.332500 2.397500 1.572500 \n", "std 0.491507 0.458831 0.471699 0.500119 0.765643 \n", "min 0.000000 0.000000 0.000000 1.000000 1.000000 \n", "25% 0.000000 0.000000 0.000000 2.000000 1.000000 \n", "50% 0.000000 0.000000 0.000000 2.000000 1.000000 \n", "75% 1.000000 1.000000 1.000000 3.000000 2.000000 \n", "max 1.000000 1.000000 1.000000 3.000000 3.000000 \n", "\n", " HistologyType LNStatus TumourStage \n", "count 400.000000 400.000000 400.000000 \n", "mean 1.147500 0.535000 2.607500 \n", "std 0.355048 0.499398 0.897473 \n", "min 1.000000 0.000000 1.000000 \n", "25% 1.000000 0.000000 2.000000 \n", "50% 1.000000 1.000000 2.000000 \n", "75% 1.000000 1.000000 3.000000 \n", "max 2.000000 1.000000 4.000000 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Df_.describe()" ] }, { "cell_type": "code", "execution_count": 13, "id": "73fc9ce6", "metadata": {}, "outputs": [], "source": [ "# Put the imputed first-12-column frame back next to the remaining columns of df\n", "# (position 13 onwards); column 0 (ID) is not part of the result.\n", "# NOTE(review): only Df_ was imputed; the columns taken from df here are passed through as-is.\n", "\n", "Df_imputed = pd.concat([Df_, df.iloc[:, 13:]], axis='columns')" ] }, { "cell_type": "code", "execution_count": 14, "id": "5bf4088c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pCR (outcome)RelapseFreeSurvival (outcome)AgeERPgRHER2TrippleNegativeChemoGradeProliferationHistologyType...original_glszm_SmallAreaHighGrayLevelEmphasisoriginal_glszm_SmallAreaLowGrayLevelEmphasisoriginal_glszm_ZoneEntropyoriginal_glszm_ZonePercentageoriginal_glszm_ZoneVarianceoriginal_ngtdm_Busynessoriginal_ngtdm_Coarsenessoriginal_ngtdm_Complexityoriginal_ngtdm_Contrastoriginal_ngtdm_Strength
count400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000...4.000000e+024.000000e+024.000000e+02400.0000004.000000e+02400.000000400.000000400.000000400.000000400.000000
mean0.21000056.00020851.8046740.5475000.4050000.3000000.3325002.3975001.5725001.147500...3.957637e-013.911005e-012.722189e+000.0033475.679717e+07178.31124632500.0326200.0569350.0059650.029322
std0.40781827.13758410.9485220.4983620.4915070.4588310.4716990.5001190.7656430.355048...1.666319e-011.615922e-017.648849e-010.0024197.063846e+081045.453432177545.9215680.0471790.0083790.115915
min0.0000000.00000023.0000000.0000000.0000000.0000000.0000001.0000001.0000001.000000...7.050000e-117.050000e-11-3.200000e-160.0000080.000000e+000.0000000.0002480.0000000.0000000.000000
25%0.00000038.00000044.5167690.0000000.0000000.0000000.0000002.0000001.0000001.000000...3.199017e-013.184398e-012.340783e+000.0013891.030473e+0618.7605700.0018260.0186280.0003100.001464
50%0.00000055.00000051.0195071.0000000.0000000.0000000.0000002.0000001.0000001.000000...4.095627e-014.054695e-012.814884e+000.0029443.277334e+0667.9296590.0043830.0477400.0023300.003276
75%0.00000073.00000060.0000001.0000001.0000001.0000001.0000003.0000002.0000001.000000...5.000049e-014.956920e-013.304411e+000.0047989.079686e+06157.3702940.0137690.0853210.0079620.009479
max1.000000144.00000079.6030121.0000001.0000001.0000001.0000003.0000003.0000002.000000...8.773779e-018.571429e-014.947427e+000.0113011.390001e+1020764.6937901000000.0000000.2851000.0607421.145601
\n", "

8 rows × 119 columns

\n", "
" ], "text/plain": [ " pCR (outcome) RelapseFreeSurvival (outcome) Age ER \\\n", "count 400.000000 400.000000 400.000000 400.000000 \n", "mean 0.210000 56.000208 51.804674 0.547500 \n", "std 0.407818 27.137584 10.948522 0.498362 \n", "min 0.000000 0.000000 23.000000 0.000000 \n", "25% 0.000000 38.000000 44.516769 0.000000 \n", "50% 0.000000 55.000000 51.019507 1.000000 \n", "75% 0.000000 73.000000 60.000000 1.000000 \n", "max 1.000000 144.000000 79.603012 1.000000 \n", "\n", " PgR HER2 TrippleNegative ChemoGrade Proliferation \\\n", "count 400.000000 400.000000 400.000000 400.000000 400.000000 \n", "mean 0.405000 0.300000 0.332500 2.397500 1.572500 \n", "std 0.491507 0.458831 0.471699 0.500119 0.765643 \n", "min 0.000000 0.000000 0.000000 1.000000 1.000000 \n", "25% 0.000000 0.000000 0.000000 2.000000 1.000000 \n", "50% 0.000000 0.000000 0.000000 2.000000 1.000000 \n", "75% 1.000000 1.000000 1.000000 3.000000 2.000000 \n", "max 1.000000 1.000000 1.000000 3.000000 3.000000 \n", "\n", " HistologyType ... original_glszm_SmallAreaHighGrayLevelEmphasis \\\n", "count 400.000000 ... 4.000000e+02 \n", "mean 1.147500 ... 3.957637e-01 \n", "std 0.355048 ... 1.666319e-01 \n", "min 1.000000 ... 7.050000e-11 \n", "25% 1.000000 ... 3.199017e-01 \n", "50% 1.000000 ... 4.095627e-01 \n", "75% 1.000000 ... 5.000049e-01 \n", "max 2.000000 ... 
8.773779e-01 \n", "\n", " original_glszm_SmallAreaLowGrayLevelEmphasis \\\n", "count 4.000000e+02 \n", "mean 3.911005e-01 \n", "std 1.615922e-01 \n", "min 7.050000e-11 \n", "25% 3.184398e-01 \n", "50% 4.054695e-01 \n", "75% 4.956920e-01 \n", "max 8.571429e-01 \n", "\n", " original_glszm_ZoneEntropy original_glszm_ZonePercentage \\\n", "count 4.000000e+02 400.000000 \n", "mean 2.722189e+00 0.003347 \n", "std 7.648849e-01 0.002419 \n", "min -3.200000e-16 0.000008 \n", "25% 2.340783e+00 0.001389 \n", "50% 2.814884e+00 0.002944 \n", "75% 3.304411e+00 0.004798 \n", "max 4.947427e+00 0.011301 \n", "\n", " original_glszm_ZoneVariance original_ngtdm_Busyness \\\n", "count 4.000000e+02 400.000000 \n", "mean 5.679717e+07 178.311246 \n", "std 7.063846e+08 1045.453432 \n", "min 0.000000e+00 0.000000 \n", "25% 1.030473e+06 18.760570 \n", "50% 3.277334e+06 67.929659 \n", "75% 9.079686e+06 157.370294 \n", "max 1.390001e+10 20764.693790 \n", "\n", " original_ngtdm_Coarseness original_ngtdm_Complexity \\\n", "count 400.000000 400.000000 \n", "mean 32500.032620 0.056935 \n", "std 177545.921568 0.047179 \n", "min 0.000248 0.000000 \n", "25% 0.001826 0.018628 \n", "50% 0.004383 0.047740 \n", "75% 0.013769 0.085321 \n", "max 1000000.000000 0.285100 \n", "\n", " original_ngtdm_Contrast original_ngtdm_Strength \n", "count 400.000000 400.000000 \n", "mean 0.005965 0.029322 \n", "std 0.008379 0.115915 \n", "min 0.000000 0.000000 \n", "25% 0.000310 0.001464 \n", "50% 0.002330 0.003276 \n", "75% 0.007962 0.009479 \n", "max 0.060742 1.145601 \n", "\n", "[8 rows x 119 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Df_imputed.describe()" ] }, { "cell_type": "code", "execution_count": 15, "id": "17f5a2c5", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Age is the only continuous feature in the clinical features, drawing a boxplot reveals that it has no outliers\n", "\n", "plt.boxplot(Df_imputed['Age'])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 16, "id": "4bb89753", "metadata": {}, "outputs": [], "source": [ "#drop RFS column which is not needed for classification\n", "\n", "Df_imputed = Df_imputed.drop(\"RelapseFreeSurvival (outcome)\", axis = 1)" ] }, { "cell_type": "markdown", "id": "d495fdb5", "metadata": {}, "source": [ "### PCA" ] }, { "cell_type": "markdown", "id": "2a8a6bb5", "metadata": {}, "source": [ "**Here are the steps followed for performing PCA:**\n", "\n", "1. Perform one-hot encoding to transform categorical data set to numerical data set\n", "2. Perform training / test split of the dataset\n", "3. Standardize the training and test data set\n", "4. Construct covariance matrix of the training data set\n", "5. Construct eigendecomposition of the covariance matrix\n", "6. Select the most important features using explained variance\n", "7. Construct the projection matrix; in the code below, the projection matrix is built from the six eigenvectors that correspond to the six largest eigenvalues, which together capture about 82% of the variance in this dataset\n", "8. 
Transform the training data set into the new feature subspace" ] }, { "cell_type": "code", "execution_count": 17, "id": "863e0896", "metadata": {}, "outputs": [], "source": [ "# only column 11 onwards taken for pca (MRI features)\n", "\n", "Df_forPCA = Df_imputed.iloc[:,11:]" ] }, { "cell_type": "code", "execution_count": 18, "id": "422c57e8", "metadata": {}, "outputs": [], "source": [ "X_forPCA = Df_forPCA\n", "Y_forPCA = Df_imputed[[\"pCR (outcome)\"]]" ] }, { "cell_type": "code", "execution_count": 19, "id": "be45d0c0", "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", "from sklearn.preprocessing import StandardScaler\n", "from itertools import chain" ] }, { "cell_type": "code", "execution_count": 20, "id": "f1b05fcc", "metadata": {}, "outputs": [], "source": [ "# Calculating the explained variance gives us an idea of how many components to keep after PCA\n", "# PCA is sensitive to feature scale, so the data must be standardized first\n", "\n", "X_pca = X_forPCA.values\n", "X_std = StandardScaler().fit_transform(X_pca)\n", "\n", "pca = PCA(svd_solver='full')\n", "pca_std = pca.fit(X_std, Y_forPCA).transform(X_std)\n", "\n", "pca_std = pd.DataFrame(pca_std)\n", "pca_std = pca_std.merge(Y_forPCA, left_index = True, right_index = True, how = 'left')\n", "pca_std['pCR (outcome)'] = pca_std['pCR (outcome)'].replace({1:'cancer',0:'no cancer'})\n", "\n" ] }, { "cell_type": "code", "execution_count": 21, "id": "6ed2f471", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8185716214347369" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cumulative explained variance ratio of the first 6 components\n", "\n", "var_pca = pd.DataFrame(pca.explained_variance_ratio_)\n", "var_pca[0:6].values.sum()" ] }, { "cell_type": "code", "execution_count": 22, "id": "bcab9637", "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { 
"labels": [ "COMP1", "COMP2", "COMP3", "COMP4", "COMP5", "COMP6", "COMP7", "COMP8", "COMP9", "COMP10", "COMP11to180" ], "marker": { "colors": [ "gold", "lightgreen", "lightcoral", "lightskyblue", "lightgrey", "orange", "pink", "cyan", "canary", "copper", "white" ], "line": { "color": "#000000", "width": 1.5 } }, "opacity": 0.8, "textfont": { "size": 15 }, "type": "pie", "values": [ 0.32603751299896166, 0.14592155630733172, 0.12252538301011849, 0.11409695341328258, 0.06188648368782481, 0.04810373201721772, 0.031325757952552664, 0.02147647926296561, 0.019818424757448185, 0.012959617492838974, 0.011274718489094194, 0.010795073898998695, 0.009805684852752825, 0.008464766608794685, 0.007078154476371388, 0.006440802476368531, 0.005943995333924142, 0.005551933464098026, 0.004157704472157613, 0.002856395431375207, 0.0026942998042132826, 0.0020857241391290977, 0.001988836161822876, 0.001732939369128958, 0.0016165929792007147, 0.0014106408206738651, 0.0012321787806290855, 0.0012221888142771888, 0.0011361270021124769, 0.000911519552879634, 0.0008626465457597014, 0.0008293003702345042, 0.0007415022809925032, 0.0006137010037896548, 0.000530025081630745, 0.0004700031013603673, 0.00042399805923927194, 0.00039205843809919064, 0.0003402001079696486, 0.0002661543786343279, 0.00024393459543110895, 0.0002174774116037788, 0.0002127195978171193, 0.0001523954335038541, 0.00014481910575382223, 0.00013270301117282882, 0.00012040136654088114, 0.00010684493046257326, 9.270661367931435e-05, 8.324072900199455e-05, 7.562151070157905e-05, 6.729662015770958e-05, 5.654389418921107e-05, 5.305823810833024e-05, 4.799178837210016e-05, 4.048327360538662e-05, 2.8612874972037562e-05, 2.2795621280844056e-05, 1.2936543194807303e-05, 1.1204766204883389e-05, 9.355738428024276e-06, 7.323851066832281e-06, 6.270076119377671e-06, 5.814251737803547e-06, 4.709956842671216e-06, 3.936749543640484e-06, 3.4525737388862696e-06, 3.1370627235821606e-06, 2.525215457880237e-06, 2.2885566465403673e-06, 
1.8216811474584748e-06, 1.191796452789283e-06, 9.085664619873523e-07, 7.303275694678878e-07, 4.1346551861368644e-07, 2.2374027575206982e-07, 1.6726611581640405e-07, 7.499274528080498e-08, 5.501767296650091e-08, 3.223878125367407e-08, 7.681846369698657e-09, 4.071097900268571e-09, 1.2575641905636173e-16, 1.0755458242960771e-17, 5.757258194417991e-18, 3.0267222257809327e-18, 1.5432619001759964e-18, 1.2480172550375865e-18, 7.668332477445324e-20, 6.952296984024896e-20, 5.94224410510818e-20, 4.3744756737723134e-20, 2.2432855028630687e-20, 2.1037582426352204e-20, 1.7440312969369985e-20, 1.4655750305879558e-20, 1.2776828274015757e-20, 7.036419053976353e-21, 3.0995819226922338e-21, 2.8569361061001943e-21, 6.680798288599692e-22, 4.925320269652189e-22, 1.8647602225646557e-25, 1.3755011360604957e-33, 1.3755011360604957e-33, 1.3755011360604957e-33, 1.3755008506017996e-33 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 
0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 
0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "fillpattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { 
"annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": 
"white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "PCA : components and explained variance" } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Pie chart to vizualize contribution of each component\n", "\n", "labels = ['COMP1','COMP2','COMP3','COMP4','COMP5','COMP6','COMP7','COMP8','COMP9','COMP10','COMP11to180']\n", "colors = ['gold', 'lightgreen', 'lightcoral', 'lightskyblue', 'lightgrey', 'orange', 'pink', 'cyan', 'canary', 'copper', 'white']\n", "\n", "trace = go.Pie(labels = labels, values = var_pca[0].values, opacity = 0.8,\n", " textfont=dict(size=15),\n", " marker=dict(colors=colors, \n", " line=dict(color='#000000', width=1.5)))\n", "\n", "\n", "layout = dict(title = 'PCA : components and explained variance')\n", " \n", " \n", "fig = dict(data = [trace], layout=layout)\n", "py.iplot(fig)" ] }, { "cell_type": "code", "execution_count": null, "id": "fe9b3f37", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 23, "id": "e68976e6", "metadata": {}, "outputs": [], "source": [ "# we will take the first 6 components as they are responsible for 0.81 of the variance.\n", "# Standardize the features\n", "scaler = StandardScaler()\n", "X_std = scaler.fit_transform(X_forPCA)\n" ] }, { "cell_type": "code", "execution_count": 24, "id": "6efebf64", "metadata": {}, "outputs": [], "source": [ "# Perform PCA\n", "pca = PCA(n_components=6) # Reduce to 6 principal components\n", "\n", "Y_pca = pd.DataFrame(Y_forPCA)\n", "X_pca = pca.fit_transform(X_std)" ] }, { "cell_type": "code", "execution_count": 25, "id": "28a674f1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.8185716214347369" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sum(pca.explained_variance_ratio_)" ] }, { "cell_type": "code", "execution_count": 26, "id": "a65f6496", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pCR (outcome)AgeERPgRHER2TrippleNegativeChemoGradeProliferationHistologyTypeLNStatusTumourStage012345
0141.000013311212.5961871.4764900.8353223.5726523.6808441.588340
1039.0110033112-3.402929-1.950037-1.8252971.893426-1.3925850.782602
2131.0000121102-3.831085-2.983421-2.1576452.095281-0.6864642.161302
3035.0000133113-6.312342-1.559572-1.8190642.526369-0.7933460.518627
4061.0100021102-2.275703-3.647228-2.0089273.357714-0.8701792.050370
......................................................
395058.5101032114-4.421618-3.627292-0.8077340.4311160.4010400.350318
396034.300013310210.7657701.8860690.430573-0.233894-0.3286210.065289
397053.3000121102-2.991509-5.684329-0.548646-6.0531063.5337111.468567
398068.81000331130.3713240.7990430.4456635.7959880.859414-2.789871
399046.0100021112-2.744353-4.132731-1.735130-4.106370-0.7574452.916702
\n", "

400 rows × 17 columns

\n", "
" ], "text/plain": [ " pCR (outcome) Age ER PgR HER2 TrippleNegative ChemoGrade \n", "0 1 41.0 0 0 0 1 3 \\\n", "1 0 39.0 1 1 0 0 3 \n", "2 1 31.0 0 0 0 1 2 \n", "3 0 35.0 0 0 0 1 3 \n", "4 0 61.0 1 0 0 0 2 \n", ".. ... ... .. ... ... ... ... \n", "395 0 58.5 1 0 1 0 3 \n", "396 0 34.3 0 0 0 1 3 \n", "397 0 53.3 0 0 0 1 2 \n", "398 0 68.8 1 0 0 0 3 \n", "399 0 46.0 1 0 0 0 2 \n", "\n", " Proliferation HistologyType LNStatus TumourStage 0 1 \n", "0 3 1 1 2 12.596187 1.476490 \\\n", "1 3 1 1 2 -3.402929 -1.950037 \n", "2 1 1 0 2 -3.831085 -2.983421 \n", "3 3 1 1 3 -6.312342 -1.559572 \n", "4 1 1 0 2 -2.275703 -3.647228 \n", ".. ... ... ... ... ... ... \n", "395 2 1 1 4 -4.421618 -3.627292 \n", "396 3 1 0 2 10.765770 1.886069 \n", "397 1 1 0 2 -2.991509 -5.684329 \n", "398 3 1 1 3 0.371324 0.799043 \n", "399 1 1 1 2 -2.744353 -4.132731 \n", "\n", " 2 3 4 5 \n", "0 0.835322 3.572652 3.680844 1.588340 \n", "1 -1.825297 1.893426 -1.392585 0.782602 \n", "2 -2.157645 2.095281 -0.686464 2.161302 \n", "3 -1.819064 2.526369 -0.793346 0.518627 \n", "4 -2.008927 3.357714 -0.870179 2.050370 \n", ".. ... ... ... ... \n", "395 -0.807734 0.431116 0.401040 0.350318 \n", "396 0.430573 -0.233894 -0.328621 0.065289 \n", "397 -0.548646 -6.053106 3.533711 1.468567 \n", "398 0.445663 5.795988 0.859414 -2.789871 \n", "399 -1.735130 -4.106370 -0.757445 2.916702 \n", "\n", "[400 rows x 17 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_pca = pd.DataFrame(X_pca)\n", "Df_afterPCA = Df_imputed.iloc[:,0:11].merge(X_pca, left_index = True, right_index = True, how = 'right')\n", "Df_afterPCA" ] }, { "cell_type": "code", "execution_count": 27, "id": "d7e18c8d", "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pCR (outcome)AgeERPgRHER2TrippleNegativeChemoGradeProliferationHistologyTypeLNStatusTumourStageCOMP0COMP1COMP2COMP3COMP4COMP5
0141.000013311212.5961871.4764900.8353223.5726523.6808441.588340
1039.0110033112-3.402929-1.950037-1.8252971.893426-1.3925850.782602
2131.0000121102-3.831085-2.983421-2.1576452.095281-0.6864642.161302
3035.0000133113-6.312342-1.559572-1.8190642.526369-0.7933460.518627
4061.0100021102-2.275703-3.647228-2.0089273.357714-0.8701792.050370
......................................................
395058.5101032114-4.421618-3.627292-0.8077340.4311160.4010400.350318
396034.300013310210.7657701.8860690.430573-0.233894-0.3286210.065289
397053.3000121102-2.991509-5.684329-0.548646-6.0531063.5337111.468567
398068.81000331130.3713240.7990430.4456635.7959880.859414-2.789871
399046.0100021112-2.744353-4.132731-1.735130-4.106370-0.7574452.916702
\n", "

400 rows × 17 columns

\n", "
" ], "text/plain": [ " pCR (outcome) Age ER PgR HER2 TrippleNegative ChemoGrade \n", "0 1 41.0 0 0 0 1 3 \\\n", "1 0 39.0 1 1 0 0 3 \n", "2 1 31.0 0 0 0 1 2 \n", "3 0 35.0 0 0 0 1 3 \n", "4 0 61.0 1 0 0 0 2 \n", ".. ... ... .. ... ... ... ... \n", "395 0 58.5 1 0 1 0 3 \n", "396 0 34.3 0 0 0 1 3 \n", "397 0 53.3 0 0 0 1 2 \n", "398 0 68.8 1 0 0 0 3 \n", "399 0 46.0 1 0 0 0 2 \n", "\n", " Proliferation HistologyType LNStatus TumourStage COMP0 COMP1 \n", "0 3 1 1 2 12.596187 1.476490 \\\n", "1 3 1 1 2 -3.402929 -1.950037 \n", "2 1 1 0 2 -3.831085 -2.983421 \n", "3 3 1 1 3 -6.312342 -1.559572 \n", "4 1 1 0 2 -2.275703 -3.647228 \n", ".. ... ... ... ... ... ... \n", "395 2 1 1 4 -4.421618 -3.627292 \n", "396 3 1 0 2 10.765770 1.886069 \n", "397 1 1 0 2 -2.991509 -5.684329 \n", "398 3 1 1 3 0.371324 0.799043 \n", "399 1 1 1 2 -2.744353 -4.132731 \n", "\n", " COMP2 COMP3 COMP4 COMP5 \n", "0 0.835322 3.572652 3.680844 1.588340 \n", "1 -1.825297 1.893426 -1.392585 0.782602 \n", "2 -2.157645 2.095281 -0.686464 2.161302 \n", "3 -1.819064 2.526369 -0.793346 0.518627 \n", "4 -2.008927 3.357714 -0.870179 2.050370 \n", ".. ... ... ... ... \n", "395 -0.807734 0.431116 0.401040 0.350318 \n", "396 0.430573 -0.233894 -0.328621 0.065289 \n", "397 -0.548646 -6.053106 3.533711 1.468567 \n", "398 0.445663 5.795988 0.859414 -2.789871 \n", "399 -1.735130 -4.106370 -0.757445 2.916702 \n", "\n", "[400 rows x 17 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_column_names = {0: 'COMP0', 1: 'COMP1', 2: 'COMP2',3: 'COMP3',4:'COMP4',5:'COMP5'}\n", "Df_afterPCA = Df_afterPCA.rename(columns=new_column_names)\n", "Df_afterPCA" ] }, { "cell_type": "code", "execution_count": 28, "id": "06da5229", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pCR (outcome)AgeERPgRHER2TrippleNegativeChemoGradeProliferationHistologyTypeLNStatusTumourStageCOMP0COMP1COMP2COMP3COMP4COMP5
count400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.0000004.000000e+024.000000e+024.000000e+024.000000e+024.000000e+024.000000e+02
mean0.21000051.8046740.5475000.4050000.3000000.3325002.3975001.5725001.1475000.5350002.6075001.421085e-16-3.552714e-17-8.437695e-17-1.065814e-161.776357e-17-3.552714e-17
std0.40781810.9485220.4983620.4915070.4588310.4716990.5001190.7656430.3550480.4993980.8974735.913835e+003.956354e+003.625338e+003.498424e+002.576519e+002.271563e+00
min0.00000023.0000000.0000000.0000000.0000000.0000001.0000001.0000001.0000000.0000001.000000-2.137126e+01-7.155386e+00-3.817510e+00-1.234668e+01-6.612577e+00-1.025440e+01
25%0.00000044.5167690.0000000.0000000.0000000.0000002.0000001.0000001.0000000.0000002.000000-4.380918e+00-2.300239e+00-1.501204e+00-2.016138e+00-1.603999e+00-1.087117e+00
50%0.00000051.0195071.0000000.0000000.0000000.0000002.0000001.0000001.0000001.0000002.000000-6.291532e-01-4.220578e-01-7.452754e-013.711719e-01-2.808058e-01-3.972333e-02
75%0.00000060.0000001.0000001.0000001.0000001.0000003.0000002.0000001.0000001.0000003.0000003.844593e+001.489531e+001.908735e-012.303221e+001.190874e+001.120237e+00
max1.00000079.6030121.0000001.0000001.0000001.0000003.0000003.0000002.0000001.0000004.0000002.246977e+014.495662e+012.031779e+011.283187e+011.878213e+012.207831e+01
\n", "
" ], "text/plain": [ " pCR (outcome) Age ER PgR HER2 \n", "count 400.000000 400.000000 400.000000 400.000000 400.000000 \\\n", "mean 0.210000 51.804674 0.547500 0.405000 0.300000 \n", "std 0.407818 10.948522 0.498362 0.491507 0.458831 \n", "min 0.000000 23.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 44.516769 0.000000 0.000000 0.000000 \n", "50% 0.000000 51.019507 1.000000 0.000000 0.000000 \n", "75% 0.000000 60.000000 1.000000 1.000000 1.000000 \n", "max 1.000000 79.603012 1.000000 1.000000 1.000000 \n", "\n", " TrippleNegative ChemoGrade Proliferation HistologyType LNStatus \n", "count 400.000000 400.000000 400.000000 400.000000 400.000000 \\\n", "mean 0.332500 2.397500 1.572500 1.147500 0.535000 \n", "std 0.471699 0.500119 0.765643 0.355048 0.499398 \n", "min 0.000000 1.000000 1.000000 1.000000 0.000000 \n", "25% 0.000000 2.000000 1.000000 1.000000 0.000000 \n", "50% 0.000000 2.000000 1.000000 1.000000 1.000000 \n", "75% 1.000000 3.000000 2.000000 1.000000 1.000000 \n", "max 1.000000 3.000000 3.000000 2.000000 1.000000 \n", "\n", " TumourStage COMP0 COMP1 COMP2 COMP3 \n", "count 400.000000 4.000000e+02 4.000000e+02 4.000000e+02 4.000000e+02 \\\n", "mean 2.607500 1.421085e-16 -3.552714e-17 -8.437695e-17 -1.065814e-16 \n", "std 0.897473 5.913835e+00 3.956354e+00 3.625338e+00 3.498424e+00 \n", "min 1.000000 -2.137126e+01 -7.155386e+00 -3.817510e+00 -1.234668e+01 \n", "25% 2.000000 -4.380918e+00 -2.300239e+00 -1.501204e+00 -2.016138e+00 \n", "50% 2.000000 -6.291532e-01 -4.220578e-01 -7.452754e-01 3.711719e-01 \n", "75% 3.000000 3.844593e+00 1.489531e+00 1.908735e-01 2.303221e+00 \n", "max 4.000000 2.246977e+01 4.495662e+01 2.031779e+01 1.283187e+01 \n", "\n", " COMP4 COMP5 \n", "count 4.000000e+02 4.000000e+02 \n", "mean 1.776357e-17 -3.552714e-17 \n", "std 2.576519e+00 2.271563e+00 \n", "min -6.612577e+00 -1.025440e+01 \n", "25% -1.603999e+00 -1.087117e+00 \n", "50% -2.808058e-01 -3.972333e-02 \n", "75% 1.190874e+00 1.120237e+00 \n", "max 1.878213e+01 
2.207831e+01 " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Df_afterPCA.describe()" ] }, { "cell_type": "code", "execution_count": 29, "id": "20d42ae7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['pCR (outcome)', 'Age', 'ER', 'PgR', 'HER2', 'TrippleNegative',\n", " 'ChemoGrade', 'Proliferation', 'HistologyType', 'LNStatus',\n", " 'TumourStage', 'COMP0', 'COMP1', 'COMP2', 'COMP3', 'COMP4', 'COMP5'],\n", " dtype='object')" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Df_afterPCA.columns" ] }, { "cell_type": "code", "execution_count": 30, "id": "9a46fb8e", "metadata": {}, "outputs": [], "source": [ "# The Age column needs to be standardized, as the other clinical features are categorical\n", "\n", "Df_afterPCA[['Age']] = StandardScaler().fit_transform(Df_afterPCA[['Age']])" ] }, { "cell_type": "code", "execution_count": 31, "id": "5827fee9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pCR (outcome)AgeERPgRHER2TrippleNegativeChemoGradeProliferationHistologyTypeLNStatusTumourStageCOMP0COMP1COMP2COMP3COMP4COMP5
count400.0000004.000000e+02400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.000000400.0000004.000000e+024.000000e+024.000000e+024.000000e+024.000000e+024.000000e+02
mean0.210000-9.325873e-170.5475000.4050000.3000000.3325002.3975001.5725001.1475000.5350002.6075001.421085e-16-3.552714e-17-8.437695e-17-1.065814e-161.776357e-17-3.552714e-17
std0.4078181.001252e+000.4983620.4915070.4588310.4716990.5001190.7656430.3550480.4993980.8974735.913835e+003.956354e+003.625338e+003.498424e+002.576519e+002.271563e+00
min0.000000-2.634214e+000.0000000.0000000.0000000.0000001.0000001.0000001.0000000.0000001.000000-2.137126e+01-7.155386e+00-3.817510e+00-1.234668e+01-6.612577e+00-1.025440e+01
25%0.000000-6.664855e-010.0000000.0000000.0000000.0000002.0000001.0000001.0000000.0000002.000000-4.380918e+00-2.300239e+00-1.501204e+00-2.016138e+00-1.603999e+00-1.087117e+00
50%0.000000-7.180426e-021.0000000.0000000.0000000.0000002.0000001.0000001.0000001.0000002.000000-6.291532e-01-4.220578e-01-7.452754e-013.711719e-01-2.808058e-01-3.972333e-02
75%0.0000007.494700e-011.0000001.0000001.0000001.0000003.0000002.0000001.0000001.0000003.0000003.844593e+001.489531e+001.908735e-012.303221e+001.190874e+001.120237e+00
max1.0000002.542183e+001.0000001.0000001.0000001.0000003.0000003.0000002.0000001.0000004.0000002.246977e+014.495662e+012.031779e+011.283187e+011.878213e+012.207831e+01
\n", "
" ], "text/plain": [ " pCR (outcome) Age ER PgR HER2 \n", "count 400.000000 4.000000e+02 400.000000 400.000000 400.000000 \\\n", "mean 0.210000 -9.325873e-17 0.547500 0.405000 0.300000 \n", "std 0.407818 1.001252e+00 0.498362 0.491507 0.458831 \n", "min 0.000000 -2.634214e+00 0.000000 0.000000 0.000000 \n", "25% 0.000000 -6.664855e-01 0.000000 0.000000 0.000000 \n", "50% 0.000000 -7.180426e-02 1.000000 0.000000 0.000000 \n", "75% 0.000000 7.494700e-01 1.000000 1.000000 1.000000 \n", "max 1.000000 2.542183e+00 1.000000 1.000000 1.000000 \n", "\n", " TrippleNegative ChemoGrade Proliferation HistologyType LNStatus \n", "count 400.000000 400.000000 400.000000 400.000000 400.000000 \\\n", "mean 0.332500 2.397500 1.572500 1.147500 0.535000 \n", "std 0.471699 0.500119 0.765643 0.355048 0.499398 \n", "min 0.000000 1.000000 1.000000 1.000000 0.000000 \n", "25% 0.000000 2.000000 1.000000 1.000000 0.000000 \n", "50% 0.000000 2.000000 1.000000 1.000000 1.000000 \n", "75% 1.000000 3.000000 2.000000 1.000000 1.000000 \n", "max 1.000000 3.000000 3.000000 2.000000 1.000000 \n", "\n", " TumourStage COMP0 COMP1 COMP2 COMP3 \n", "count 400.000000 4.000000e+02 4.000000e+02 4.000000e+02 4.000000e+02 \\\n", "mean 2.607500 1.421085e-16 -3.552714e-17 -8.437695e-17 -1.065814e-16 \n", "std 0.897473 5.913835e+00 3.956354e+00 3.625338e+00 3.498424e+00 \n", "min 1.000000 -2.137126e+01 -7.155386e+00 -3.817510e+00 -1.234668e+01 \n", "25% 2.000000 -4.380918e+00 -2.300239e+00 -1.501204e+00 -2.016138e+00 \n", "50% 2.000000 -6.291532e-01 -4.220578e-01 -7.452754e-01 3.711719e-01 \n", "75% 3.000000 3.844593e+00 1.489531e+00 1.908735e-01 2.303221e+00 \n", "max 4.000000 2.246977e+01 4.495662e+01 2.031779e+01 1.283187e+01 \n", "\n", " COMP4 COMP5 \n", "count 4.000000e+02 4.000000e+02 \n", "mean 1.776357e-17 -3.552714e-17 \n", "std 2.576519e+00 2.271563e+00 \n", "min -6.612577e+00 -1.025440e+01 \n", "25% -1.603999e+00 -1.087117e+00 \n", "50% -2.808058e-01 -3.972333e-02 \n", "75% 1.190874e+00 1.120237e+00 
\n", "max 1.878213e+01 2.207831e+01 " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Df_afterPCA.describe()" ] }, { "cell_type": "code", "execution_count": null, "id": "4400d37f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 32, "id": "a2cb8e4b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['pCR (outcome)', 'Age', 'ER', 'PgR', 'HER2', 'TrippleNegative',\n", " 'ChemoGrade', 'Proliferation', 'HistologyType', 'LNStatus',\n", " 'TumourStage', 'COMP0', 'COMP1', 'COMP2', 'COMP3', 'COMP4', 'COMP5'],\n", " dtype='object')" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Df_afterPCA.columns" ] }, { "cell_type": "code", "execution_count": 33, "id": "c10aff68", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Correlation matrix between the features.\n", "\n", "cor=Df_afterPCA.corr()\n", "plt.figure(figsize=(20, 10))\n", "sns.heatmap(cor, annot=True ,cmap='Reds')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 34, "id": "adc7d1cb", "metadata": {}, "outputs": [], "source": [ "# Proliferation dropped due to high correlation with other feature\n", "\n", "Df_afterPCA = Df_afterPCA.drop(columns=['Proliferation'])" ] }, { "cell_type": "code", "execution_count": 35, "id": "3db422fc", "metadata": {}, "outputs": [], "source": [ "X = Df_afterPCA.drop(\"pCR (outcome)\", axis = 1)\n", "Y = Df_afterPCA[[\"pCR (outcome)\"]]" ] }, { "cell_type": "code", "execution_count": 36, "id": "bbbf7c99", "metadata": {}, "outputs": [], "source": [ "# Before looking into oversampling we need to split the data into train and test\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1234, stratify = Y)\n", "#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)" ] }, { "cell_type": "code", "execution_count": 37, "id": "7deda9f6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pCR (outcome)\n", "0 221\n", "1 59\n", "Name: count, dtype: int64\n", "pCR (outcome)\n", "0 95\n", "1 25\n", "Name: count, dtype: int64\n" ] } ], "source": [ "print(y_train.value_counts())\n", "print(y_test.value_counts())" ] }, { "cell_type": "markdown", "id": "47880ba0", "metadata": {}, "source": [ "### Data Oversampling" ] }, { "cell_type": "markdown", "id": "06ed1e9d", "metadata": {}, "source": [ "**Our data is not balanced. 
We have very few instances of pCR = 1.**" ] }, { "cell_type": "markdown", "id": "ecb3b38f", "metadata": {}, "source": [ "#### SMOTEENN\n", "In order to balance the data, we first use a hybrid data augmentation technique called SMOTEENN (SMOTE + ENN).\n", "SMOTE (Synthetic Minority Oversampling Technique), ENN (Edited Nearest Neighbours)" ] }, { "cell_type": "code", "execution_count": 38, "id": "b5086f85", "metadata": {}, "outputs": [], "source": [ "# from imblearn.combine import SMOTEENN" ] }, { "cell_type": "code", "execution_count": 39, "id": "48e884d8", "metadata": {}, "outputs": [], "source": [ "# from collections import Counter" ] }, { "cell_type": "code", "execution_count": 40, "id": "98696447", "metadata": {}, "outputs": [], "source": [ "# counter = Y.value_counts()\n", "# print ('Before', counter)\n", "# # oversampling the train dataset using SMOTE + ENN\n", "# smenn = SMOTEENN()\n", "# X_smenn, Y_smenn = smenn.fit_resample(X_train, y_train)\n", "# counter = Y_smenn.value_counts()\n", "# print ('After',counter)" ] }, { "cell_type": "code", "execution_count": null, "id": "3e7d2480", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "cfa576e4", "metadata": {}, "source": [ "**Adaptive Synthetic Sampling Approach**" ] }, { "cell_type": "code", "execution_count": 41, "id": "81b6f872", "metadata": {}, "outputs": [], "source": [ "from imblearn.over_sampling import ADASYN" ] }, { "cell_type": "code", "execution_count": 42, "id": "a9a5a798", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Before pCR (outcome)\n", "0 221\n", "1 59\n", "Name: count, dtype: int64\n", "After pCR (outcome)\n", "1 231\n", "0 221\n", "Name: count, dtype: int64\n" ] } ], "source": [ "counter = y_train.value_counts()\n", "print ('Before', counter)\n", "# oversampling the train dataset using ADASYN\n", "ada = ADASYN(random_state=130, sampling_strategy='auto')\n", "X_ada, Y_ada = ada.fit_resample(X_train, y_train)\n", 
"counter = Y_ada.value_counts()\n", "print ('After', counter)" ] }, { "cell_type": "code", "execution_count": 43, "id": "cea8efc0", "metadata": {}, "outputs": [], "source": [ "X_train = X_ada\n", "y_train = Y_ada" ] }, { "cell_type": "code", "execution_count": null, "id": "9e2c8275", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "6c3f8218", "metadata": {}, "source": [ "#### Random OverSampling" ] }, { "cell_type": "code", "execution_count": 44, "id": "57eec054", "metadata": {}, "outputs": [], "source": [ "# from imblearn.over_sampling import RandomOverSampler" ] }, { "cell_type": "code", "execution_count": 45, "id": "3ec1b3d9", "metadata": {}, "outputs": [], "source": [ "# rndm_sampler = RandomOverSampler(sampling_strategy='minority')" ] }, { "cell_type": "code", "execution_count": 46, "id": "caf8bfe5", "metadata": {}, "outputs": [], "source": [ "# X_over, y_over = rndm_sampler.fit_resample(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 47, "id": "74509bbe", "metadata": {}, "outputs": [], "source": [ "# print(y_train.value_counts())\n", "# print(y_over.value_counts())" ] }, { "cell_type": "code", "execution_count": 48, "id": "6c0283ac", "metadata": {}, "outputs": [], "source": [ "# X_train = X_over\n", "# y_train = y_over" ] }, { "cell_type": "markdown", "id": "7262fa4e", "metadata": {}, "source": [ "The best results were achieved through Adaptive Synthetic Sampling Approach" ] }, { "cell_type": "code", "execution_count": null, "id": "1310b54e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "940e2187", "metadata": {}, "source": [ "## Modelling" ] }, { "cell_type": "markdown", "id": "7885a497", "metadata": {}, "source": [ "### Simple ANN" ] }, { "cell_type": "code", "execution_count": 102, "id": "91f1077c", "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf\n", "from tensorflow.keras import Sequential\n", "from tensorflow.keras.layers import Dense, Dropout" ] }, { 
"cell_type": "code", "execution_count": 244, "id": "fac960b1", "metadata": {}, "outputs": [], "source": [ "# Define a neural network model\n", "act_func = tf.keras.layers.LeakyReLU(alpha=0.1)\n", "model = Sequential([\n", " Dense(18, activation=act_func, input_shape=(15,)),\n", " Dense(22, activation=act_func),\n", " Dense(12, activation=act_func),\n", " Dense(6, activation=act_func),\n", " Dense(2, activation='softmax')\n", "])\n" ] }, { "cell_type": "code", "execution_count": 245, "id": "c2855f5d", "metadata": {}, "outputs": [], "source": [ "# Compile the model\n", "model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), \n", " loss=tf.losses.sparse_categorical_crossentropy, metrics=['accuracy'])" ] }, { "cell_type": "code", "execution_count": 246, "id": "73eb4813", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"sequential_19\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " dense_107 (Dense) (None, 18) 288 \n", " \n", " dense_108 (Dense) (None, 22) 418 \n", " \n", " dense_109 (Dense) (None, 12) 276 \n", " \n", " dense_110 (Dense) (None, 6) 78 \n", " \n", " dense_111 (Dense) (None, 2) 14 \n", " \n", "=================================================================\n", "Total params: 1,074\n", "Trainable params: 1,074\n", "Non-trainable params: 0\n", "_________________________________________________________________\n" ] } ], "source": [ "model.summary()" ] }, { "cell_type": "code", "execution_count": 247, "id": "e829c87f", "metadata": {}, "outputs": [], "source": [ "# Train the model using PCA-transformed data\n", "traning = model.fit(X_train, y_train, epochs=200, batch_size=32, verbose=0)" ] }, { "cell_type": "code", "execution_count": 248, "id": "b5f955b4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4/4 
[==============================] - 0s 2ms/step\n" ] } ], "source": [ "# Making predictions on test data\n", "y_pred = model.predict(X_test)\n", "# y_pred = np.round(y_pred).flatten() # Round predictions for binary classification" ] }, { "cell_type": "code", "execution_count": 249, "id": "83c4dcec", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0]\n" ] } ], "source": [ "# since softmax outputs one probability per class, we take the argmax as the predicted label\n", "y_pred = [int(np.argmax(x)) for x in y_pred]\n", "print(y_pred)" ] }, { "cell_type": "code", "execution_count": 250, "id": "1c9d39d7", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import classification_report, confusion_matrix" ] }, { "cell_type": "code", "execution_count": 251, "id": "8c7085bd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.725\n" ] } ], "source": [ "# Calculating accuracy\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(f\"Accuracy: {accuracy}\")" ] }, { "cell_type": "code", "execution_count": 252, "id": "95c53d25", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.82 0.83 0.83 95\n", " 1 0.33 0.32 0.33 25\n", "\n", " accuracy 0.73 120\n", " macro avg 0.58 0.58 0.58 120\n", "weighted avg 0.72 0.72 0.72 120\n", "\n", "[[79 16]\n", " [17 8]]\n" ] } ], "source": [ "# Print classification report and confusion matrix\n", "print(classification_report(y_test, y_pred))\n", "print(confusion_matrix(y_test, y_pred))" ] }, { "cell_type": 
"code", "execution_count": 253, "id": "6f2b337a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.5757894736842105\n" ] } ], "source": [ "from sklearn.metrics import balanced_accuracy_score\n", "print(balanced_accuracy_score(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 254, "id": "1f31037d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Keras weights file () saving:\n", "...layers\\dense\n", "......vars\n", ".........0\n", ".........1\n", "...layers\\dense\\activation\n", "......vars\n", "...layers\\dense_1\n", "......vars\n", ".........0\n", ".........1\n", "...layers\\dense_2\n", "......vars\n", ".........0\n", ".........1\n", "...layers\\dense_3\n", "......vars\n", ".........0\n", ".........1\n", "...layers\\dense_4\n", "......vars\n", ".........0\n", ".........1\n", "...metrics\\mean\n", "......vars\n", ".........0\n", ".........1\n", "...metrics\\mean_metric_wrapper\n", "......vars\n", ".........0\n", ".........1\n", "...optimizer\n", "......vars\n", ".........0\n", ".........1\n", ".........10\n", ".........11\n", ".........12\n", ".........13\n", ".........14\n", ".........15\n", ".........16\n", ".........17\n", ".........18\n", ".........19\n", ".........2\n", ".........20\n", ".........3\n", ".........4\n", ".........5\n", ".........6\n", ".........7\n", ".........8\n", ".........9\n", "...vars\n", "Keras model archive saving:\n", "File Name Modified Size\n", "config.json 2023-12-17 22:08:39 3747\n", "metadata.json 2023-12-17 22:08:39 64\n", "variables.h5 2023-12-17 22:08:39 46608\n" ] }, { "data": { "text/plain": [ "['DeepLrng_0.57Bal_0.72Acc_0Imbalance.pkl']" ] }, "execution_count": 254, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import joblib\n", "# joblib.dump(model, 'DeepLrng_0.57Bal_0.72Acc_0Imbalance.pkl')" ] }, { "cell_type": "code", "execution_count": 255, "id": "1d669a5c", "metadata": {}, "outputs": [], "source": [ "# 
balAcc_deepL = balanced_accuracy_score(y_test, y_pred)" ] }, { "cell_type": "markdown", "id": "c5150641", "metadata": {}, "source": [ "### XGBoost" ] }, { "cell_type": "code", "execution_count": 256, "id": "6fd838a0", "metadata": {}, "outputs": [], "source": [ "from xgboost import XGBClassifier\n", "import matplotlib.pyplot as pyplot" ] }, { "cell_type": "code", "execution_count": 257, "id": "526ab347", "metadata": {}, "outputs": [], "source": [ "evalset = [(X_train, y_train), (X_test, y_test)]" ] }, { "cell_type": "code", "execution_count": 272, "id": "7a88db2e", "metadata": {}, "outputs": [], "source": [ "model = XGBClassifier(min_child_weight=1, max_depth=10, learning_rate=0.05, gamma=0.1, colsample_bytree=0.4, booster='gbtree')" ] }, { "cell_type": "code", "execution_count": null, "id": "1d24b4fb", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 273, "id": "dba3a6e3", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0]\tvalidation_0-logloss:0.67661\tvalidation_1-logloss:0.70328\n", "[1]\tvalidation_0-logloss:0.65521\tvalidation_1-logloss:0.69531\n", "[2]\tvalidation_0-logloss:0.63482\tvalidation_1-logloss:0.69321\n", "[3]\tvalidation_0-logloss:0.61969\tvalidation_1-logloss:0.68630\n", "[4]\tvalidation_0-logloss:0.59719\tvalidation_1-logloss:0.67694\n", "[5]\tvalidation_0-logloss:0.57770\tvalidation_1-logloss:0.66516\n", "[6]\tvalidation_0-logloss:0.55879\tvalidation_1-logloss:0.65911\n", "[7]\tvalidation_0-logloss:0.54115\tvalidation_1-logloss:0.65018\n", "[8]\tvalidation_0-logloss:0.52642\tvalidation_1-logloss:0.64157\n", "[9]\tvalidation_0-logloss:0.51256\tvalidation_1-logloss:0.63656\n", "[10]\tvalidation_0-logloss:0.49827\tvalidation_1-logloss:0.62891\n", "[11]\tvalidation_0-logloss:0.48187\tvalidation_1-logloss:0.62065\n", "[12]\tvalidation_0-logloss:0.47075\tvalidation_1-logloss:0.61621\n", "[13]\tvalidation_0-logloss:0.45760\tvalidation_1-logloss:0.60950\n", 
"[14]\tvalidation_0-logloss:0.44531\tvalidation_1-logloss:0.60429\n", "[15]\tvalidation_0-logloss:0.43407\tvalidation_1-logloss:0.60269\n", "[16]\tvalidation_0-logloss:0.42562\tvalidation_1-logloss:0.59791\n", "[17]\tvalidation_0-logloss:0.41697\tvalidation_1-logloss:0.59881\n", "[18]\tvalidation_0-logloss:0.40683\tvalidation_1-logloss:0.59550\n", "[19]\tvalidation_0-logloss:0.39827\tvalidation_1-logloss:0.59078\n", "[20]\tvalidation_0-logloss:0.39192\tvalidation_1-logloss:0.58710\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\USER\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\xgboost\\sklearn.py:885: UserWarning:\n", "\n", "`eval_metric` in `fit` method is deprecated for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead.\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[21]\tvalidation_0-logloss:0.38236\tvalidation_1-logloss:0.58428\n", "[22]\tvalidation_0-logloss:0.37297\tvalidation_1-logloss:0.58231\n", "[23]\tvalidation_0-logloss:0.36465\tvalidation_1-logloss:0.57750\n", "[24]\tvalidation_0-logloss:0.35681\tvalidation_1-logloss:0.57194\n", "[25]\tvalidation_0-logloss:0.34760\tvalidation_1-logloss:0.56954\n", "[26]\tvalidation_0-logloss:0.34147\tvalidation_1-logloss:0.56553\n", "[27]\tvalidation_0-logloss:0.33414\tvalidation_1-logloss:0.56233\n", "[28]\tvalidation_0-logloss:0.32687\tvalidation_1-logloss:0.56031\n", "[29]\tvalidation_0-logloss:0.32007\tvalidation_1-logloss:0.55773\n", "[30]\tvalidation_0-logloss:0.31395\tvalidation_1-logloss:0.55747\n", "[31]\tvalidation_0-logloss:0.30720\tvalidation_1-logloss:0.55553\n", "[32]\tvalidation_0-logloss:0.30214\tvalidation_1-logloss:0.55556\n", "[33]\tvalidation_0-logloss:0.29754\tvalidation_1-logloss:0.55539\n", "[34]\tvalidation_0-logloss:0.29113\tvalidation_1-logloss:0.55743\n", "[35]\tvalidation_0-logloss:0.28578\tvalidation_1-logloss:0.55632\n", 
"[36]\tvalidation_0-logloss:0.28070\tvalidation_1-logloss:0.55425\n", "[37]\tvalidation_0-logloss:0.27574\tvalidation_1-logloss:0.55206\n", "[38]\tvalidation_0-logloss:0.27079\tvalidation_1-logloss:0.55134\n", "[39]\tvalidation_0-logloss:0.26558\tvalidation_1-logloss:0.55058\n", "[40]\tvalidation_0-logloss:0.26029\tvalidation_1-logloss:0.54862\n", "[41]\tvalidation_0-logloss:0.25407\tvalidation_1-logloss:0.54772\n", "[42]\tvalidation_0-logloss:0.24963\tvalidation_1-logloss:0.54629\n", "[43]\tvalidation_0-logloss:0.24462\tvalidation_1-logloss:0.54600\n", "[44]\tvalidation_0-logloss:0.23989\tvalidation_1-logloss:0.54402\n", "[45]\tvalidation_0-logloss:0.23602\tvalidation_1-logloss:0.54339\n", "[46]\tvalidation_0-logloss:0.23144\tvalidation_1-logloss:0.54039\n", "[47]\tvalidation_0-logloss:0.22633\tvalidation_1-logloss:0.54134\n", "[48]\tvalidation_0-logloss:0.22224\tvalidation_1-logloss:0.54023\n", "[49]\tvalidation_0-logloss:0.21808\tvalidation_1-logloss:0.53932\n", "[50]\tvalidation_0-logloss:0.21443\tvalidation_1-logloss:0.53857\n", "[51]\tvalidation_0-logloss:0.21089\tvalidation_1-logloss:0.53861\n", "[52]\tvalidation_0-logloss:0.20781\tvalidation_1-logloss:0.53834\n", "[53]\tvalidation_0-logloss:0.20434\tvalidation_1-logloss:0.53884\n", "[54]\tvalidation_0-logloss:0.20090\tvalidation_1-logloss:0.53712\n", "[55]\tvalidation_0-logloss:0.19820\tvalidation_1-logloss:0.53733\n", "[56]\tvalidation_0-logloss:0.19497\tvalidation_1-logloss:0.53574\n", "[57]\tvalidation_0-logloss:0.19180\tvalidation_1-logloss:0.53556\n", "[58]\tvalidation_0-logloss:0.18955\tvalidation_1-logloss:0.53448\n", "[59]\tvalidation_0-logloss:0.18650\tvalidation_1-logloss:0.53664\n", "[60]\tvalidation_0-logloss:0.18340\tvalidation_1-logloss:0.53739\n", "[61]\tvalidation_0-logloss:0.17981\tvalidation_1-logloss:0.53735\n", "[62]\tvalidation_0-logloss:0.17627\tvalidation_1-logloss:0.53687\n", "[63]\tvalidation_0-logloss:0.17385\tvalidation_1-logloss:0.53707\n", 
"[64]\tvalidation_0-logloss:0.17148\tvalidation_1-logloss:0.53633\n", "[65]\tvalidation_0-logloss:0.16960\tvalidation_1-logloss:0.53696\n", "[66]\tvalidation_0-logloss:0.16749\tvalidation_1-logloss:0.53501\n", "[67]\tvalidation_0-logloss:0.16505\tvalidation_1-logloss:0.53411\n", "[68]\tvalidation_0-logloss:0.16263\tvalidation_1-logloss:0.53457\n", "[69]\tvalidation_0-logloss:0.16023\tvalidation_1-logloss:0.53434\n", "[70]\tvalidation_0-logloss:0.15778\tvalidation_1-logloss:0.53550\n", "[71]\tvalidation_0-logloss:0.15544\tvalidation_1-logloss:0.53587\n", "[72]\tvalidation_0-logloss:0.15348\tvalidation_1-logloss:0.53702\n", "[73]\tvalidation_0-logloss:0.15119\tvalidation_1-logloss:0.53632\n", "[74]\tvalidation_0-logloss:0.14912\tvalidation_1-logloss:0.53775\n", "[75]\tvalidation_0-logloss:0.14715\tvalidation_1-logloss:0.53862\n", "[76]\tvalidation_0-logloss:0.14537\tvalidation_1-logloss:0.53827\n", "[77]\tvalidation_0-logloss:0.14367\tvalidation_1-logloss:0.53889\n", "[78]\tvalidation_0-logloss:0.14195\tvalidation_1-logloss:0.53773\n", "[79]\tvalidation_0-logloss:0.14046\tvalidation_1-logloss:0.53854\n", "[80]\tvalidation_0-logloss:0.13853\tvalidation_1-logloss:0.53992\n", "[81]\tvalidation_0-logloss:0.13685\tvalidation_1-logloss:0.54047\n", "[82]\tvalidation_0-logloss:0.13494\tvalidation_1-logloss:0.53971\n", "[83]\tvalidation_0-logloss:0.13361\tvalidation_1-logloss:0.54048\n", "[84]\tvalidation_0-logloss:0.13218\tvalidation_1-logloss:0.54120\n", "[85]\tvalidation_0-logloss:0.13043\tvalidation_1-logloss:0.54178\n", "[86]\tvalidation_0-logloss:0.12883\tvalidation_1-logloss:0.54060\n", "[87]\tvalidation_0-logloss:0.12764\tvalidation_1-logloss:0.54163\n", "[88]\tvalidation_0-logloss:0.12647\tvalidation_1-logloss:0.54080\n", "[89]\tvalidation_0-logloss:0.12505\tvalidation_1-logloss:0.54188\n", "[90]\tvalidation_0-logloss:0.12349\tvalidation_1-logloss:0.54293\n", "[91]\tvalidation_0-logloss:0.12207\tvalidation_1-logloss:0.54180\n", 
"[92]\tvalidation_0-logloss:0.12069\tvalidation_1-logloss:0.54237\n", "[93]\tvalidation_0-logloss:0.11904\tvalidation_1-logloss:0.54333\n", "[94]\tvalidation_0-logloss:0.11808\tvalidation_1-logloss:0.54402\n", "[95]\tvalidation_0-logloss:0.11668\tvalidation_1-logloss:0.54486\n", "[96]\tvalidation_0-logloss:0.11538\tvalidation_1-logloss:0.54451\n", "[97]\tvalidation_0-logloss:0.11415\tvalidation_1-logloss:0.54635\n", "[98]\tvalidation_0-logloss:0.11297\tvalidation_1-logloss:0.54765\n", "[99]\tvalidation_0-logloss:0.11168\tvalidation_1-logloss:0.54912\n" ] }, { "data": { "text/html": [ "
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,\n",
       "              colsample_bylevel=None, colsample_bynode=None,\n",
       "              colsample_bytree=0.4, device=None, early_stopping_rounds=None,\n",
       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
       "              gamma=0.1, grow_policy=None, importance_type=None,\n",
       "              interaction_constraints=None, learning_rate=0.05, max_bin=None,\n",
       "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
       "              max_delta_step=None, max_depth=10, max_leaves=None,\n",
       "              min_child_weight=1, missing=nan, monotone_constraints=None,\n",
       "              multi_strategy=None, n_estimators=None, n_jobs=None,\n",
       "              num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "XGBClassifier(base_score=None, booster='gbtree', callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=0.4, device=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=0.1, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=0.05, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=10, max_leaves=None,\n", " min_child_weight=1, missing=nan, monotone_constraints=None,\n", " multi_strategy=None, n_estimators=None, n_jobs=None,\n", " num_parallel_tree=None, random_state=None, ...)" ] }, "execution_count": 273, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fit while tracking logloss on both the train and test sets listed in evalset.\n", "# eval_metric is not passed to fit(): that argument is deprecated there, and\n", "# logloss is already XGBoost's default evaluation metric for binary classification.\n", "model.fit(X_train, y_train, eval_set=evalset)" ] }, { "cell_type": "code", "execution_count": 274, "id": "c4259ffe", "metadata": {}, "outputs": [], "source": [ "results = model.evals_result()" ] }, { "cell_type": "code", "execution_count": 275, "id": "16e42c9c", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# plot learning curves\n", "\n", "pyplot.plot(results['validation_0']['logloss'], label='train')\n", "pyplot.plot(results['validation_1']['logloss'], label='test')\n", "# show the legend\n", "pyplot.legend()\n", "# show the plot\n", "pyplot.show()" ] }, { "cell_type": "code", "execution_count": 276, "id": "43313de6", "metadata": {}, "outputs": [], "source": [ "y_pred = model.predict(X_test)\n", "# since XGBoost outputs probabilities insted of binary values we have\n", "# to round them to the nearest\n", "y_pred = [round(value) for value in y_pred] \n" ] }, { "cell_type": "code", "execution_count": 277, "id": "8a841d6d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.82 0.84 0.83 95\n", " 1 0.35 0.32 0.33 25\n", "\n", " accuracy 0.73 120\n", " macro avg 0.59 0.58 0.58 120\n", "weighted avg 0.73 0.73 0.73 120\n", "\n", "[[80 15]\n", " [17 8]]\n" ] } ], "source": [ "from sklearn.metrics import classification_report, confusion_matrix\n", "\n", "# Print classification report and confusion matrix\n", "print(classification_report(y_test, y_pred))\n", "print(confusion_matrix(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 278, "id": "d794033a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.5810526315789474\n" ] } ], "source": [ "from sklearn.metrics import balanced_accuracy_score\n", "print(balanced_accuracy_score(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 279, "id": "312380b7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['XGBoost_0.73_0.58Bal_0Imbalance_final.pkl']" ] }, "execution_count": 279, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import joblib\n", "# joblib.dump(model, 'XGBoost_0.73_0.58Bal_0Imbalance_final.pkl')" ] }, { "cell_type": "code", "execution_count": 280, "id": "26725c9b", 
"metadata": {}, "outputs": [], "source": [ "# balAcc_XGB = balanced_accuracy_score(y_test, y_pred)" ] }, { "cell_type": "code", "execution_count": 268, "id": "ea0f5e05", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import GridSearchCV\n", "from sklearn.metrics import make_scorer, balanced_accuracy_score" ] }, { "cell_type": "code", "execution_count": null, "id": "31cc1391", "metadata": {}, "outputs": [], "source": [ "# bl_scorer = make_scorer(balanced_accuracy_score)\n", "\n", "# clf = GridSearchCV(XGBClassifier(), {\n", "# 'booster': ['gblinear', 'gbtree', 'dart'],\n", "# 'max_depth': [5, 6, 7, 8, 9, 10]\n", "# }, cv=3, return_train_score = False, scoring=bl_scorer)\n", "\n", "# clf.fit(X_train, y_train)\n", "# clf_results = pd.DataFrame(clf.cv_results_)\n", "# clf_results" ] }, { "cell_type": "code", "execution_count": 269, "id": "aab2e4d6", "metadata": {}, "outputs": [], "source": [ "params = {\n", " 'learning_rate' :[ 0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ],\n", " 'max_depth' :[3,4,5,6,8,10,12,15],\n", " 'min_child_weight' :[1,3,5,7],\n", " 'gamma' : [0.0, 0.1, 0.2, 0.3, 0.4],\n", " 'colsample_bytree' :[0.3,0.4,0.5,0.7],\n", " 'booster': ['gbtree', 'dart']\n", "}" ] }, { "cell_type": "code", "execution_count": 270, "id": "2612239f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n" ] }, { "data": { "text/html": [ "
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,\n",
       "              colsample_bylevel=None, colsample_bynode=None,\n",
       "              colsample_bytree=0.4, device=None, early_stopping_rounds=None,\n",
       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
       "              gamma=0.1, grow_policy=None, importance_type=None,\n",
       "              interaction_constraints=None, learning_rate=0.05, max_bin=None,\n",
       "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
       "              max_delta_step=None, max_depth=10, max_leaves=None,\n",
       "              min_child_weight=1, missing=nan, monotone_constraints=None,\n",
       "              multi_strategy=None, n_estimators=None, n_jobs=None,\n",
       "              num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "XGBClassifier(base_score=None, booster='gbtree', callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=0.4, device=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " gamma=0.1, grow_policy=None, importance_type=None,\n", " interaction_constraints=None, learning_rate=0.05, max_bin=None,\n", " max_cat_threshold=None, max_cat_to_onehot=None,\n", " max_delta_step=None, max_depth=10, max_leaves=None,\n", " min_child_weight=1, missing=nan, monotone_constraints=None,\n", " multi_strategy=None, n_estimators=None, n_jobs=None,\n", " num_parallel_tree=None, random_state=None, ...)" ] }, "execution_count": 270, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import RandomizedSearchCV\n", "rmCV = RandomizedSearchCV(XGBClassifier(), param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs =-1, cv=3, verbose=3)\n", "rmCV.fit(X_train, y_train)\n", "rmCV.best_estimator_" ] }, { "cell_type": "code", "execution_count": 271, "id": "2829402b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'min_child_weight': 1,\n", " 'max_depth': 10,\n", " 'learning_rate': 0.05,\n", " 'gamma': 0.1,\n", " 'colsample_bytree': 0.4,\n", " 'booster': 'gbtree'}" ] }, "execution_count": 271, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rmCV.best_params_" ] }, { "cell_type": "code", "execution_count": null, "id": "f4d39c94", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "7de4de33", "metadata": {}, "source": [ "### LightGBM" ] }, { "cell_type": "code", "execution_count": 281, "id": "8e3b9cbd", "metadata": {}, "outputs": [], "source": [ "from lightgbm import LGBMClassifier" ] }, { "cell_type": "code", "execution_count": 282, "id": "ed6f29fd", "metadata": {}, "outputs": [], "source": [ "model = LGBMClassifier()" ] }, { "cell_type": "code", "execution_count": 283, "id": 
"36d16ee3", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\USER\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\sklearn\\preprocessing\\_label.py:99: DataConversionWarning:\n", "\n", "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", "\n", "C:\\Users\\USER\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\sklearn\\preprocessing\\_label.py:134: DataConversionWarning:\n", "\n", "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[LightGBM] [Info] Number of positive: 231, number of negative: 221\n", "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000557 seconds.\n", "You can set `force_row_wise=true` to remove the overhead.\n", "And if memory is not enough, you can set `force_col_wise=true`.\n", "[LightGBM] [Info] Total Bins 1080\n", "[LightGBM] [Info] Number of data points in the train set: 452, number of used features: 15\n", "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.511062 -> initscore=0.044255\n", "[LightGBM] [Info] Start training from score 0.044255\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n" ] }, { "data": { "text/html": [ "
LGBMClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LGBMClassifier()" ] }, "execution_count": 283, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 284, "id": "9a2674d7", "metadata": {}, "outputs": [], "source": [ "y_pred = model.predict(X_test)\n", "# since LightGBM outputs probabilities insted of binary values we have\n", "# to round them to the nearest\n", "y_pred = [round(value) for value in y_pred] " ] }, { "cell_type": "code", "execution_count": 285, "id": "b4a0b802", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.82 0.84 0.83 95\n", " 1 0.35 0.32 0.33 25\n", "\n", " accuracy 0.73 120\n", " macro avg 0.59 0.58 0.58 120\n", "weighted avg 0.73 0.73 0.73 120\n", "\n", "[[80 15]\n", " [17 8]]\n" ] } ], "source": [ "from sklearn.metrics import classification_report, confusion_matrix\n", "\n", "# Print classification report and confusion matrix\n", "print(classification_report(y_test, y_pred))\n", "print(confusion_matrix(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 286, "id": "e3d411a4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.5810526315789474\n" ] } ], "source": [ "from sklearn.metrics import balanced_accuracy_score\n", "print(balanced_accuracy_score(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 287, "id": "25e15b80", "metadata": {}, "outputs": [], "source": [ "# balAcc_LGBM = balanced_accuracy_score(y_test, y_pred)" ] }, { "cell_type": "markdown", "id": "5dcb7f85", "metadata": {}, "source": [ "### Logistic Regression" ] }, { "cell_type": "code", "execution_count": 288, "id": "c669d69a", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import KFold" ] }, { "cell_type": "code", 
"execution_count": 289, "id": "42f9bc94", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LogisticRegression(max_iter=500)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "LogisticRegression(max_iter=500)" ] }, "execution_count": 289, "metadata": {}, "output_type": "execute_result" } ], "source": [ "logReg = LogisticRegression(max_iter=500, tol=0.0001)\n", "logReg.fit(X_train, y_train.values.ravel())" ] }, { "cell_type": "code", "execution_count": 290, "id": "445ca097", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,\n", " 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,\n", " 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)" ] }, "execution_count": 290, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred=logReg.predict(X_test)\n", "y_pred" ] }, { "cell_type": "code", "execution_count": 291, "id": "771ca9f1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[77, 18],\n", " [18, 7]], dtype=int64)" ] }, "execution_count": 291, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import metrics\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "cnf_matrix = metrics.confusion_matrix(y_test, y_pred)\n", "cnf_matrix" ] }, { "cell_type": "code", "execution_count": 292, "id": "da36200e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.81 0.81 0.81 95\n", " 1 0.28 0.28 0.28 25\n", "\n", " accuracy 0.70 120\n", " macro avg 0.55 0.55 0.55 120\n", "weighted avg 0.70 0.70 0.70 120\n", "\n" ] } ], "source": [ "print(classification_report(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 293, "id": "ecb51543", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.5452631578947369\n" ] } ], "source": 
[ "from sklearn.metrics import balanced_accuracy_score\n", "print(balanced_accuracy_score(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 294, "id": "a662c876", "metadata": {}, "outputs": [], "source": [ "# balAcc_LogReg = balanced_accuracy_score(y_test, y_pred)" ] }, { "cell_type": "markdown", "id": "275688a7", "metadata": {}, "source": [ "### SVC" ] }, { "cell_type": "code", "execution_count": 295, "id": "b02b9f4c", "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", "from sklearn.metrics import classification_report, confusion_matrix" ] }, { "cell_type": "code", "execution_count": 308, "id": "93c0eddb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
SVC(gamma=0.1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "SVC(gamma=0.1)" ] }, "execution_count": 308, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = SVC(kernel = 'rbf', gamma=0.1, C=1.0)\n", "model.fit(X_train, y_train.values.ravel())" ] }, { "cell_type": "code", "execution_count": 309, "id": "f555f2f0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.82 0.87 0.85 95\n", " 1 0.37 0.28 0.32 25\n", "\n", " accuracy 0.75 120\n", " macro avg 0.60 0.58 0.58 120\n", "weighted avg 0.73 0.75 0.74 120\n", "\n" ] } ], "source": [ "y_pred = model.predict(X_test)\n", "print(classification_report(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 310, "id": "054e912a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[83, 12],\n", " [18, 7]], dtype=int64)" ] }, "execution_count": 310, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cnf_matrix = confusion_matrix(y_test, y_pred)\n", "cnf_matrix" ] }, { "cell_type": "code", "execution_count": 311, "id": "aa6362cb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.5768421052631579\n" ] } ], "source": [ "from sklearn.metrics import balanced_accuracy_score\n", "print(balanced_accuracy_score(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 312, "id": "1f367280", "metadata": {}, "outputs": [], "source": [ "# clf = GridSearchCV(SVC(), {\n", "# 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],\n", "# 'C': [0.1, 1, 10, 50]\n", "# }, cv=3, return_train_score = False)\n", "\n", "# clf.fit(X_ada, Y_ada)\n", "# clf_results = pd.DataFrame(clf.cv_results_)\n", "# clf_results" ] }, { "cell_type": "code", "execution_count": 313, "id": "e973a1ce", "metadata": {}, "outputs": [], "source": [ "params = {\n", " 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],\n", " 'C': np.logspace(-3, 2, 6),\n", " 'gamma' : np.logspace(-3, 2, 6)\n", "}" ] }, { "cell_type": "code", 
"execution_count": 306, "id": "85bc8d41", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 5 candidates, totalling 15 fits\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\USER\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\sklearn\\utils\\validation.py:1143: DataConversionWarning:\n", "\n", "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", "\n" ] }, { "data": { "text/html": [ "
SVC(gamma=0.1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "SVC(gamma=0.1)" ] }, "execution_count": 306, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import RandomizedSearchCV\n", "rmCV = RandomizedSearchCV(SVC(), param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs =-1, cv=3, verbose=3)\n", "rmCV.fit(X_train, y_train)\n", "rmCV.best_estimator_" ] }, { "cell_type": "code", "execution_count": 307, "id": "fb2d74c8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'kernel': 'rbf', 'gamma': 0.1, 'C': 1.0}" ] }, "execution_count": 307, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rmCV.best_params_" ] }, { "cell_type": "code", "execution_count": 314, "id": "b8c2ae90", "metadata": {}, "outputs": [], "source": [ "# balAcc_SVC = balanced_accuracy_score(y_test, y_pred)" ] }, { "cell_type": "code", "execution_count": null, "id": "40fa50ce", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 327, "id": "8b513ea4", "metadata": {}, "outputs": [], "source": [ "import math\n", "acc_val = [balAcc_deepL, balAcc_LGBM, balAcc_LogReg, balAcc_SVC, balAcc_XGB]\n", "rounded_acc_val = []\n", "\n", "for var in acc_val:\n", " rounded_var = round(var, 3)\n", " rounded_acc_val.append(rounded_var)" ] }, { "cell_type": "code", "execution_count": 329, "id": "79520186", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "models = ['NeuralNet', 'LightGBM', 'LogReg', 'SVC', 'XGBoost']\n", "\n", "# Define colors for each bar\n", "colors = ['cyan', 'green', 'orange', 'red', 'purple']\n", "\n", "# Create a figure and axis for bar plot\n", "fig, ax = plt.subplots(figsize=(8, 6))\n", "\n", "# Plot the bar plot\n", "bars = ax.bar(models, rounded_acc_val, color=colors)\n", "\n", "# Add value labels to each bar\n", "for bar in bars:\n", " height = bar.get_height()\n", " ax.annotate(f'{height}', xy=(bar.get_x() + bar.get_width() / 2, height),\n", " xytext=(0, 3), textcoords=\"offset points\",\n", " ha='center', va='bottom')\n", "\n", "ax.set_ylabel('Balanced Accuracy')\n", "ax.set_title('Balanced Accuracy Score Comparison')\n", "\n", "# Show the plot\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "1744cc36", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }