2413 lines (2412 with data), 426.6 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dd2d8358",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd \n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import sys\n",
"\n",
"# Silence library warnings unless the interpreter was started with explicit -W options.\n",
"if len(sys.warnoptions) == 0:\n",
"    import warnings\n",
"    warnings.simplefilter(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "071adfe5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample_id</th>\n",
" <th>patient_cohort</th>\n",
" <th>sample_origin</th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>diagnosis</th>\n",
" <th>stage</th>\n",
" <th>benign_sample_diagnosis</th>\n",
" <th>plasma_CA19_9</th>\n",
" <th>creatinine</th>\n",
" <th>LYVE1</th>\n",
" <th>REG1B</th>\n",
" <th>TFF1</th>\n",
" <th>REG1A</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>S1</td>\n",
" <td>Cohort1</td>\n",
" <td>BPTB</td>\n",
" <td>33</td>\n",
" <td>F</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>11.7</td>\n",
" <td>1.83222</td>\n",
" <td>0.893219</td>\n",
" <td>52.94884</td>\n",
" <td>654.282174</td>\n",
" <td>1262.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>S10</td>\n",
" <td>Cohort1</td>\n",
" <td>BPTB</td>\n",
" <td>81</td>\n",
" <td>F</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.97266</td>\n",
" <td>2.037585</td>\n",
" <td>94.46703</td>\n",
" <td>209.488250</td>\n",
" <td>228.407</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>S100</td>\n",
" <td>Cohort2</td>\n",
" <td>BPTB</td>\n",
" <td>51</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7.0</td>\n",
" <td>0.78039</td>\n",
" <td>0.145589</td>\n",
" <td>102.36600</td>\n",
" <td>461.141000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>S101</td>\n",
" <td>Cohort2</td>\n",
" <td>BPTB</td>\n",
" <td>61</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>8.0</td>\n",
" <td>0.70122</td>\n",
" <td>0.002805</td>\n",
" <td>60.57900</td>\n",
" <td>142.950000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>S102</td>\n",
" <td>Cohort2</td>\n",
" <td>BPTB</td>\n",
" <td>62</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9.0</td>\n",
" <td>0.21489</td>\n",
" <td>0.000860</td>\n",
" <td>65.54000</td>\n",
" <td>41.088000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sample_id patient_cohort sample_origin age sex diagnosis stage \\\n",
"0 S1 Cohort1 BPTB 33 F 1 NaN \n",
"1 S10 Cohort1 BPTB 81 F 1 NaN \n",
"2 S100 Cohort2 BPTB 51 M 1 NaN \n",
"3 S101 Cohort2 BPTB 61 M 1 NaN \n",
"4 S102 Cohort2 BPTB 62 M 1 NaN \n",
"\n",
" benign_sample_diagnosis plasma_CA19_9 creatinine LYVE1 REG1B \\\n",
"0 NaN 11.7 1.83222 0.893219 52.94884 \n",
"1 NaN NaN 0.97266 2.037585 94.46703 \n",
"2 NaN 7.0 0.78039 0.145589 102.36600 \n",
"3 NaN 8.0 0.70122 0.002805 60.57900 \n",
"4 NaN 9.0 0.21489 0.000860 65.54000 \n",
"\n",
" TFF1 REG1A \n",
"0 654.282174 1262.000 \n",
"1 209.488250 228.407 \n",
"2 461.141000 NaN \n",
"3 142.950000 NaN \n",
"4 41.088000 NaN "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the Debernardi et al. 2020 dataset (path is relative to the notebook's working directory)\n",
"df = pd.read_csv(filepath_or_buffer='Debernardi et al 2020 data.csv')\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7aea3f5f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 590 entries, 0 to 589\n",
"Data columns (total 14 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 sample_id 590 non-null object \n",
" 1 patient_cohort 590 non-null object \n",
" 2 sample_origin 590 non-null object \n",
" 3 age 590 non-null int64 \n",
" 4 sex 590 non-null object \n",
" 5 diagnosis 590 non-null int64 \n",
" 6 stage 199 non-null object \n",
" 7 benign_sample_diagnosis 208 non-null object \n",
" 8 plasma_CA19_9 350 non-null float64\n",
" 9 creatinine 590 non-null float64\n",
" 10 LYVE1 590 non-null float64\n",
" 11 REG1B 590 non-null float64\n",
" 12 TFF1 590 non-null float64\n",
" 13 REG1A 306 non-null float64\n",
"dtypes: float64(6), int64(2), object(6)\n",
"memory usage: 64.7+ KB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "327fbab1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"sample_id 0\n",
"patient_cohort 0\n",
"sample_origin 0\n",
"age 0\n",
"sex 0\n",
"diagnosis 0\n",
"stage 391\n",
"benign_sample_diagnosis 382\n",
"plasma_CA19_9 240\n",
"creatinine 0\n",
"LYVE1 0\n",
"REG1B 0\n",
"TFF1 0\n",
"REG1A 284\n",
"dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of missing values in each column\n",
"df.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bce04c59",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2 208\n",
"3 199\n",
"1 183\n",
"Name: diagnosis, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Class balance of the target column\n",
"df['diagnosis'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "de687416",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# REG1A distribution within each diagnosis class\n",
"sns.boxplot(data=df, x='diagnosis', y='REG1A')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "68f14c8d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"diagnosis\n",
"1 79\n",
"2 87\n",
"3 140\n",
"Name: REG1A, dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Non-null REG1A measurements available per diagnosis class\n",
"df.groupby('diagnosis')['REG1A'].count()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "ae720538",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.0\n",
"Name: REG1A, dtype: float64"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Most frequent REG1A value(s)\n",
"df['REG1A'].mode()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "fb9ea237",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"diagnosis\n",
"1 113.000\n",
"2 127.174\n",
"3 411.006\n",
"Name: REG1A, dtype: float64"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-class median of REG1A\n",
"df.groupby('diagnosis')['REG1A'].agg('median')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "32afb3c4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>diagnosis</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>79.0</td>\n",
" <td>227.871886</td>\n",
" <td>288.272476</td>\n",
" <td>0.0</td>\n",
" <td>62.15350</td>\n",
" <td>113.000</td>\n",
" <td>285.60700</td>\n",
" <td>1617.142</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>87.0</td>\n",
" <td>547.458092</td>\n",
" <td>1210.159890</td>\n",
" <td>0.0</td>\n",
" <td>63.54550</td>\n",
" <td>127.174</td>\n",
" <td>405.43800</td>\n",
" <td>8083.492</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>140.0</td>\n",
" <td>1138.323721</td>\n",
" <td>1871.810807</td>\n",
" <td>0.0</td>\n",
" <td>152.33625</td>\n",
" <td>411.006</td>\n",
" <td>1435.60625</td>\n",
" <td>13200.000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% 50% \\\n",
"diagnosis \n",
"1 79.0 227.871886 288.272476 0.0 62.15350 113.000 \n",
"2 87.0 547.458092 1210.159890 0.0 63.54550 127.174 \n",
"3 140.0 1138.323721 1871.810807 0.0 152.33625 411.006 \n",
"\n",
" 75% max \n",
"diagnosis \n",
"1 285.60700 1617.142 \n",
"2 405.43800 8083.492 \n",
"3 1435.60625 13200.000 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics of REG1A for each diagnosis class\n",
"df.groupby('diagnosis')['REG1A'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "bb230276",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# plasma_CA19_9 distribution within each diagnosis class\n",
"sns.boxplot(data=df, x='diagnosis', y='plasma_CA19_9')\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "3274f336",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>diagnosis</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>92.0</td>\n",
" <td>8.749569</td>\n",
" <td>12.189355</td>\n",
" <td>0.0</td>\n",
" <td>1.707332</td>\n",
" <td>5.334598</td>\n",
" <td>9.15</td>\n",
" <td>84.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>108.0</td>\n",
" <td>61.785741</td>\n",
" <td>235.816534</td>\n",
" <td>1.0</td>\n",
" <td>9.750000</td>\n",
" <td>17.000000</td>\n",
" <td>29.25</td>\n",
" <td>1913.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>150.0</td>\n",
" <td>1476.154733</td>\n",
" <td>3550.038158</td>\n",
" <td>0.6</td>\n",
" <td>99.700000</td>\n",
" <td>427.500000</td>\n",
" <td>1457.50</td>\n",
" <td>31000.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% 50% \\\n",
"diagnosis \n",
"1 92.0 8.749569 12.189355 0.0 1.707332 5.334598 \n",
"2 108.0 61.785741 235.816534 1.0 9.750000 17.000000 \n",
"3 150.0 1476.154733 3550.038158 0.6 99.700000 427.500000 \n",
"\n",
" 75% max \n",
"diagnosis \n",
"1 9.15 84.3 \n",
"2 29.25 1913.0 \n",
"3 1457.50 31000.0 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics of plasma_CA19_9 for each diagnosis class\n",
"df.groupby('diagnosis')['plasma_CA19_9'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2e3e811c",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Bar chart of the number of samples in each diagnosis class\n",
"plt.figure(figsize=(6, 4))\n",
"sns.countplot(x='diagnosis', data=df).set(\n",
"    xlabel='Diagnosis',\n",
"    ylabel='Count',\n",
"    title='Distribution by classes',\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "72603ac8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<seaborn.axisgrid.FacetGrid at 0x168b6dc40>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 404.625x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Point plot of age per diagnosis class, with one line per sex\n",
"sns.catplot(\n",
"    kind=\"point\",\n",
"    data=df,\n",
"    x=\"diagnosis\",\n",
"    y=\"age\",\n",
"    hue=\"sex\",\n",
"    palette=dict(M=\"g\", F=\"m\"),\n",
"    markers=[\"^\", \"o\"],\n",
"    linestyles=[\"-\", \"--\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "88330945",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Sample counts per diagnosis class, split by sex\n",
"sns.countplot(x='diagnosis', hue='sex', data=df).set(\n",
"    xlabel='Diagnosis',\n",
"    ylabel='Count',\n",
"    title='Gender Count in Each Classes',\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4efad607",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sex diagnosis\n",
"F 1 115\n",
" 2 101\n",
" 3 83\n",
"M 3 116\n",
" 2 107\n",
" 1 68\n",
"Name: diagnosis, dtype: int64\n"
]
}
],
"source": [
"# Diagnosis counts within each sex\n",
"gender = df.groupby(by=['sex'])['diagnosis'].value_counts()\n",
"print(gender)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "080cd406",
"metadata": {},
"outputs": [],
"source": [
"import plotly.express as px"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "49c8f02b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>diagnosis</th>\n",
" <th>plasma_CA19_9</th>\n",
" <th>creatinine</th>\n",
" <th>LYVE1</th>\n",
" <th>REG1B</th>\n",
" <th>TFF1</th>\n",
" <th>REG1A</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>33</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>11.700000</td>\n",
" <td>1.83222</td>\n",
" <td>0.893219</td>\n",
" <td>52.94884</td>\n",
" <td>654.282174</td>\n",
" <td>1262.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>81</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8.749569</td>\n",
" <td>0.97266</td>\n",
" <td>2.037585</td>\n",
" <td>94.46703</td>\n",
" <td>209.488250</td>\n",
" <td>228.407000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>51</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7.000000</td>\n",
" <td>0.78039</td>\n",
" <td>0.145589</td>\n",
" <td>102.36600</td>\n",
" <td>461.141000</td>\n",
" <td>227.871886</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>61</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>8.000000</td>\n",
" <td>0.70122</td>\n",
" <td>0.002805</td>\n",
" <td>60.57900</td>\n",
" <td>142.950000</td>\n",
" <td>227.871886</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>62</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9.000000</td>\n",
" <td>0.21489</td>\n",
" <td>0.000860</td>\n",
" <td>65.54000</td>\n",
" <td>41.088000</td>\n",
" <td>227.871886</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex diagnosis plasma_CA19_9 creatinine LYVE1 REG1B \\\n",
"0 33 0 1 11.700000 1.83222 0.893219 52.94884 \n",
"1 81 0 1 8.749569 0.97266 2.037585 94.46703 \n",
"2 51 1 1 7.000000 0.78039 0.145589 102.36600 \n",
"3 61 1 1 8.000000 0.70122 0.002805 60.57900 \n",
"4 62 1 1 9.000000 0.21489 0.000860 65.54000 \n",
"\n",
" TFF1 REG1A \n",
"0 654.282174 1262.000000 \n",
"1 209.488250 228.407000 \n",
"2 461.141000 227.871886 \n",
"3 142.950000 227.871886 \n",
"4 41.088000 227.871886 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Encode sex as integer category codes (categories are sorted, so F -> 0 and M -> 1)\n",
"df['sex'] = df['sex'].astype('category').cat.codes\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "34a649f1",
"metadata": {},
"outputs": [],
"source": [
"#drop multiple columns by name\n",
"df.drop(['patient_cohort', \n",
" 'sample_origin',\n",
" 'stage', \n",
" 'benign_sample_diagnosis',\n",
" 'sample_id'], axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e2d3ad5a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"diagnosis\n",
"1 227.871886\n",
"2 547.458092\n",
"3 1138.323721\n",
"Name: REG1A, dtype: float64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-class mean of REG1A (missing values are skipped)\n",
"df.groupby('diagnosis')['REG1A'].agg('mean')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "fbbf9275",
"metadata": {},
"outputs": [],
"source": [
"# Fill missing REG1A values with the mean REG1A of the sample's diagnosis class.\n",
"# NOTE(review): the fill value is derived from the target column and from all rows (the\n",
"# train/test split happens later), so label information leaks into this feature - confirm\n",
"# this is intended before trusting the reported metrics.\n",
"reg1a_class_mean = df.groupby('diagnosis')['REG1A'].transform('mean')\n",
"df['REG1A'] = df['REG1A'].fillna(reg1a_class_mean)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "7a3f9ff2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"diagnosis\n",
"1 227.871886\n",
"2 547.458092\n",
"3 1138.323721\n",
"Name: REG1A, dtype: float64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sanity check: per-class REG1A mean, to compare against the value seen before imputation\n",
"df.groupby('diagnosis')['REG1A'].agg('mean')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "689eb922",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"diagnosis\n",
"1 8.749569\n",
"2 61.785741\n",
"3 1476.154733\n",
"Name: plasma_CA19_9, dtype: float64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-class mean of plasma_CA19_9 (missing values are skipped)\n",
"df.groupby('diagnosis')['plasma_CA19_9'].agg('mean')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "acd74365",
"metadata": {},
"outputs": [],
"source": [
"# Fill missing plasma_CA19_9 values with the mean of the sample's diagnosis class.\n",
"# NOTE(review): the fill value is derived from the target column and from all rows (the\n",
"# train/test split happens later), so label information leaks into this feature - confirm\n",
"# this is intended before trusting the reported metrics.\n",
"ca19_9_class_mean = df.groupby('diagnosis')['plasma_CA19_9'].transform('mean')\n",
"df['plasma_CA19_9'] = df['plasma_CA19_9'].fillna(ca19_9_class_mean)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "b00a59ec",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>diagnosis</th>\n",
" <th>plasma_CA19_9</th>\n",
" <th>creatinine</th>\n",
" <th>LYVE1</th>\n",
" <th>REG1B</th>\n",
" <th>TFF1</th>\n",
" <th>REG1A</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>33</td>\n",
" <td>F</td>\n",
" <td>1</td>\n",
" <td>11.700000</td>\n",
" <td>1.83222</td>\n",
" <td>0.893219</td>\n",
" <td>52.94884</td>\n",
" <td>654.282174</td>\n",
" <td>1262.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>81</td>\n",
" <td>F</td>\n",
" <td>1</td>\n",
" <td>8.749569</td>\n",
" <td>0.97266</td>\n",
" <td>2.037585</td>\n",
" <td>94.46703</td>\n",
" <td>209.488250</td>\n",
" <td>228.407000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>51</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>7.000000</td>\n",
" <td>0.78039</td>\n",
" <td>0.145589</td>\n",
" <td>102.36600</td>\n",
" <td>461.141000</td>\n",
" <td>227.871886</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>61</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>8.000000</td>\n",
" <td>0.70122</td>\n",
" <td>0.002805</td>\n",
" <td>60.57900</td>\n",
" <td>142.950000</td>\n",
" <td>227.871886</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>62</td>\n",
" <td>M</td>\n",
" <td>1</td>\n",
" <td>9.000000</td>\n",
" <td>0.21489</td>\n",
" <td>0.000860</td>\n",
" <td>65.54000</td>\n",
" <td>41.088000</td>\n",
" <td>227.871886</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex diagnosis plasma_CA19_9 creatinine LYVE1 REG1B \\\n",
"0 33 F 1 11.700000 1.83222 0.893219 52.94884 \n",
"1 81 F 1 8.749569 0.97266 2.037585 94.46703 \n",
"2 51 M 1 7.000000 0.78039 0.145589 102.36600 \n",
"3 61 M 1 8.000000 0.70122 0.002805 60.57900 \n",
"4 62 M 1 9.000000 0.21489 0.000860 65.54000 \n",
"\n",
" TFF1 REG1A \n",
"0 654.282174 1262.000000 \n",
"1 209.488250 228.407000 \n",
"2 461.141000 227.871886 \n",
"3 142.950000 227.871886 \n",
"4 41.088000 227.871886 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8fa1359",
"metadata": {},
"outputs": [],
"source": [
"# Heatmap of pairwise correlations between the numeric columns, with column labels on top.\n",
"# Plotting the raw 590-row frame with annot=True would write one annotation per cell and\n",
"# fails on any non-numeric column, so the correlation matrix is shown instead.\n",
"ax = sns.heatmap(df.select_dtypes('number').corr(), annot=True)\n",
"ax.set(xlabel=\"\", ylabel=\"\")\n",
"ax.xaxis.tick_top()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "75cc9c4b",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1080x720 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Lower-triangle correlation heatmap.\n",
"# The correlation matrix is computed once, and the mask is built from cell positions rather\n",
"# than from the correlation values, so an exactly-zero correlation above the diagonal is\n",
"# still hidden.\n",
"corr = df.corr()\n",
"mask = np.triu(np.ones_like(corr, dtype=bool))\n",
"plt.figure(figsize=(15, 10))\n",
"sns.heatmap(corr, mask=mask, annot=True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "de393007",
"metadata": {},
"outputs": [],
"source": [
"# Separate the target column from the feature matrix\n",
"y = df['diagnosis']\n",
"x = df.drop(columns='diagnosis')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "09758975",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(590, 8)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible\n",
"x_train, x_test, y_train, y_test = train_test_split(\n",
"    x, y, test_size=0.20, random_state=42\n",
")\n",
"x.shape"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "f81e6785",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(472, 8) (118, 8)\n"
]
}
],
"source": [
"# (rows, columns) of the training and test feature matrices\n",
"print(*(part.shape for part in (x_train, x_test)))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "23965bc4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(472,) (118,)\n"
]
}
],
"source": [
"# Lengths of the training and test label vectors\n",
"print(*(part.shape for part in (y_train, y_test)))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ec12f24d",
"metadata": {},
"outputs": [],
"source": [
"# Shift the class labels from {1, 2, 3} to {0, 1, 2}.\n",
"# The arrays are rebuilt from the untouched `y` through the row index of each split, so\n",
"# re-running this cell does not subtract 1 a second time.\n",
"y_train = np.asarray(y.loc[x_train.index]) - 1\n",
"y_test = np.asarray(y.loc[x_test.index]) - 1"
]
},
{
"cell_type": "markdown",
"id": "8d95dffb",
"metadata": {},
"source": [
"# Logistic regression"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "5baa9c4b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cross-validation scores: [0.82105263 0.81052632 0.82978723 0.71276596 0.81914894]\n",
"Average cross-validation score: 0.7986562150055991\n",
" precision recall f1-score support\n",
"\n",
" 0 0.82 0.98 0.89 41\n",
" 1 0.92 0.56 0.70 39\n",
" 2 0.80 0.95 0.87 38\n",
"\n",
" accuracy 0.83 118\n",
" macro avg 0.84 0.83 0.82 118\n",
"weighted avg 0.84 0.83 0.82 118\n",
"\n"
]
}
],
"source": [
"from sklearn.datasets import load_iris\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"from sklearn.model_selection import cross_val_score, train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# One-vs-rest logistic regression fitted with the liblinear solver\n",
"log_reg = LogisticRegression(solver='liblinear', multi_class='ovr')\n",
"\n",
"# 5-fold cross-validation restricted to the training split\n",
"cv_scores = cross_val_score(estimator=log_reg, X=x_train, y=y_train, cv=5)\n",
"print(\"Cross-validation scores: \", cv_scores)\n",
"print(\"Average cross-validation score: \", cv_scores.mean())\n",
"\n",
"# Fit on the whole training split, then report per-class metrics on the held-out test split\n",
"y_pred_log = log_reg.fit(x_train, y_train).predict(x_test)\n",
"print(classification_report(y_test, y_pred_log))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "8a59f018",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"import xgboost as xgb\n",
"from sklearn.metrics import roc_curve, auc\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Class-membership probabilities for the test split; column k belongs to log_reg.classes_[k]\n",
"y_prob = log_reg.predict_proba(x_test)\n",
"\n",
"# One-vs-rest ROC curve and AUC per class, keyed by probability-column index.\n",
"# Iterating over classes_ (instead of a hard-coded range(3)) keeps labels and columns aligned.\n",
"fpr = {}\n",
"tpr = {}\n",
"roc_auc = {}\n",
"for i, class_label in enumerate(log_reg.classes_):\n",
"    fpr[i], tpr[i], _ = roc_curve(y_test == class_label, y_prob[:, i])\n",
"    roc_auc[i] = auc(fpr[i], tpr[i])\n",
"\n",
"# Draw one curve per class; zip stops at the shorter of the two sequences\n",
"plt.figure()\n",
"lw = 2\n",
"colors = ['red', 'blue', 'green']\n",
"for i, color in zip(fpr, colors):\n",
"    plt.plot(fpr[i], tpr[i], color=color, lw=lw,\n",
"             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))\n",
"\n",
"# Chance diagonal and axes decoration do not depend on the class, so they are set once\n",
"plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')\n",
"plt.xlim([0.0, 1.0])\n",
"plt.ylim([0.0, 1.05])\n",
"plt.xlabel('False Positive Rate')\n",
"plt.ylabel('True Positive Rate')\n",
"plt.title('ROC curves and AUCs for each class')\n",
"plt.legend(loc=\"lower right\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "24ddd7dc",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x432 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"cm = confusion_matrix(y_test, y_pred_log)\n",
"\n",
"# Tick labels for the matrix axes\n",
"# NOTE(review): assumes this order matches the encoded classes 0, 1, 2 - confirm\n",
"classes = ['Healthy', 'Non-cancerous', 'Pancreatic cancer']\n",
"\n",
"# Plot the confusion matrix\n",
"plt.figure(figsize=(6, 6))\n",
"plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)\n",
"plt.title('Confusion matrix')\n",
"plt.colorbar()\n",
"tick_marks = np.arange(len(classes))\n",
"plt.xticks(tick_marks, classes, rotation=45)\n",
"plt.yticks(tick_marks, classes)\n",
"\n",
"# Cells hold integer sample counts, so print them as integers rather than with two decimals\n",
"fmt = 'd'\n",
"# White text on cells darker than half the maximum count, black text otherwise\n",
"thresh = cm.max() / 2.\n",
"for i, j in np.ndindex(cm.shape):\n",
"    plt.text(j, i, format(cm[i, j], fmt),\n",
"             horizontalalignment=\"center\",\n",
"             color=\"white\" if cm[i, j] > thresh else \"black\")\n",
"plt.xlabel('Predicted label')\n",
"plt.ylabel('True label')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "3a9adfe4",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x432 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Get the feature coefficients\n",
"feature_importance = np.abs(log_reg.coef_[0])\n",
"\n",
"# Sort the feature importance in descending order\n",
"sorted_indices = np.argsort(feature_importance)[1::]\n",
"sorted_importance = feature_importance[sorted_indices]\n",
"\n",
"# Get the feature names from your dataset (assuming they are stored in a list called 'feature_names')\n",
"sorted_feature_names = [df.columns[i] for i in sorted_indices]\n",
"\n",
"# Plot the feature importance\n",
"plt.figure(figsize=(10, 6))\n",
"plt.barh(range(len(sorted_importance)), sorted_importance, align='center')\n",
"plt.yticks(range(len(sorted_importance)), sorted_feature_names)\n",
"plt.xlabel('Feature Importance')\n",
"plt.title('Logistic Regression - Feature Importance')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "eb25ef16",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature '['TFF1', 'LYVE1', 'age', 'creatinine', 'diagnosis', 'sex', 'plasma_CA19_9']': Importance score [0.00365786 0.00374688 0.01836726 0.02765692 0.10125162 0.30531965\n",
" 1.03473583]\n"
]
}
],
"source": [
"# Ranked feature names alongside their importance scores\n",
"print(\"Feature '{}': Importance score {}\".format(sorted_feature_names, sorted_importance))"
]
},
{
"cell_type": "markdown",
"id": "736264ff",
"metadata": {},
"source": [
"# SVM"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "1c0af181",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cross-validation scores: [0.83050847 0.75423729 0.75423729 0.72033898 0.8559322 ]\n",
"Average cross-validation score: 0.7830508474576271\n",
" precision recall f1-score support\n",
"\n",
" 0 0.85 0.95 0.90 41\n",
" 1 0.71 0.77 0.74 39\n",
" 2 0.90 0.71 0.79 38\n",
"\n",
" accuracy 0.81 118\n",
" macro avg 0.82 0.81 0.81 118\n",
"weighted avg 0.82 0.81 0.81 118\n",
"\n"
]
}
],
"source": [
"from sklearn import svm\n",
"from sklearn.model_selection import train_test_split, cross_val_score\n",
"from sklearn.metrics import classification_report\n",
"\n",
"# RBF-kernel SVM; probability=True enables predict_proba.\n",
"# random_state fixes the internal shuffling used for the probability estimates,\n",
"# so predict_proba is reproducible between runs.\n",
"svm_model = svm.SVC(kernel='rbf', C=100, gamma='scale', probability=True, random_state=42)\n",
"\n",
"# 5-fold cross-validation on the training split only, so the held-out test rows\n",
"# do not influence the reported cross-validation score\n",
"cv_scores = cross_val_score(svm_model, x_train, y_train, cv=5)\n",
"\n",
"# Print cross-validation scores for each fold\n",
"print(\"Cross-validation scores:\", cv_scores)\n",
"\n",
"# Calculate and print average cross-validation score\n",
"avg_cv_score = cv_scores.mean()\n",
"print(\"Average cross-validation score:\", avg_cv_score)\n",
"\n",
"# Train on the training split and report per-class metrics on the test split\n",
"svm_model.fit(x_train, y_train)\n",
"y_pred1 = svm_model.predict(x_test)\n",
"print(classification_report(y_test, y_pred1))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "811920c0",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x432 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"cm = confusion_matrix(y_test, y_pred1)\n",
"\n",
"# Tick labels for the matrix axes\n",
"# NOTE(review): assumes this order matches the encoded classes 0, 1, 2 - confirm\n",
"classes = ['Healthy', 'Non-cancerous', 'Pancreatic cancer']\n",
"\n",
"# Plot the confusion matrix\n",
"plt.figure(figsize=(6, 6))\n",
"plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)\n",
"plt.title('Confusion matrix')\n",
"plt.colorbar()\n",
"tick_marks = np.arange(len(classes))\n",
"plt.xticks(tick_marks, classes, rotation=45)\n",
"plt.yticks(tick_marks, classes)\n",
"\n",
"# Cells hold integer sample counts, so print them as integers rather than with two decimals\n",
"fmt = 'd'\n",
"# White text on cells darker than half the maximum count, black text otherwise\n",
"thresh = cm.max() / 2.\n",
"for i, j in np.ndindex(cm.shape):\n",
"    plt.text(j, i, format(cm[i, j], fmt),\n",
"             horizontalalignment=\"center\",\n",
"             color=\"white\" if cm[i, j] > thresh else \"black\")\n",
"plt.xlabel('Predicted label')\n",
"plt.ylabel('True label')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "c0847ae7",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"from sklearn.svm import SVC\n",
"from sklearn.metrics import roc_curve, auc\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.inspection import permutation_importance\n",
"\n",
"# Class-membership probabilities for the test split; column k belongs to svm_model.classes_[k]\n",
"y_prob = svm_model.predict_proba(x_test)\n",
"\n",
"# One-vs-rest ROC curve and AUC per class, keyed by probability-column index.\n",
"# Iterating over classes_ (instead of a hard-coded range(3)) keeps labels and columns aligned.\n",
"fpr = {}\n",
"tpr = {}\n",
"roc_auc = {}\n",
"for i, class_label in enumerate(svm_model.classes_):\n",
"    fpr[i], tpr[i], _ = roc_curve(y_test == class_label, y_prob[:, i])\n",
"    roc_auc[i] = auc(fpr[i], tpr[i])\n",
"\n",
"# Draw one curve per class; zip stops at the shorter of the two sequences\n",
"plt.figure()\n",
"lw = 2\n",
"colors = ['red', 'blue', 'green']\n",
"for i, color in zip(fpr, colors):\n",
"    plt.plot(fpr[i], tpr[i], color=color, lw=lw,\n",
"             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))\n",
"\n",
"# Chance diagonal and axes decoration do not depend on the class, so they are set once\n",
"plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')\n",
"plt.xlim([0.0, 1.0])\n",
"plt.ylim([0.0, 1.05])\n",
"plt.xlabel('False Positive Rate')\n",
"plt.ylabel('True Positive Rate')\n",
"plt.title('ROC curves and AUCs for each class')\n",
"plt.legend(loc=\"lower right\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "b2a9b6c3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text(0, 0.5, 'Feautures')"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x432 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Permutation importance of each input feature on the test split.\n",
"# Seeded so the random shuffles (and therefore the ranking) are repeatable across runs.\n",
"perm_importance = permutation_importance(svm_model, x_test, y_test, random_state=42)\n",
"\n",
"# Use the test frame's own column names when it carries them; otherwise fall back to df.columns.\n",
"# NOTE(review): df.columns only lines up with the model inputs if df holds exactly the\n",
"# feature columns in training order - confirm against the cell that builds x_train/x_test.\n",
"feature_names = getattr(x_test, 'columns', df.columns)\n",
"\n",
"features = np.array(feature_names)\n",
"# Ascending order of mean importance, so the most important feature ends up at the top of the chart.\n",
"sorted_idx = perm_importance.importances_mean.argsort()\n",
"plt.figure(figsize=(10, 6))\n",
"plt.barh(features[sorted_idx], perm_importance.importances_mean[sorted_idx])\n",
"plt.title('Support Vector Machine - Feature Importance')\n",
"plt.xlabel(\"Feature Importance\")\n",
"plt.ylabel(\"Features\")\n",
"# Explicit show() keeps the Text(...) repr of the last call out of the cell output.\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "f0475908",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0. , -0.00847458, 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. ],\n",
" [ 0.00847458, 0.02542373, -0.00847458, 0.03389831, 0.00847458],\n",
" [ 0.01694915, 0.00847458, 0.04237288, 0.04237288, -0.00847458],\n",
" [ 0.13559322, 0.13559322, 0.18644068, 0.16101695, 0.1440678 ],\n",
" [ 0.37288136, 0.3559322 , 0.33050847, 0.28813559, 0.3220339 ]])"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"perm_importance.importances[sorted_idx]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "8983c3e5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training accuracy 84.53389830508475\n",
"Test accuracy 81.35593220338984\n"
]
}
],
"source": [
"# Share of correct SVM predictions, in percent, on the training and test splits.\n",
"for split_label, split_x, split_y in (('Training', x_train, y_train), ('Test', x_test, y_test)):\n",
"    hit_rate = np.mean(svm_model.predict(split_x) == split_y) * 100\n",
"    print(split_label + ' accuracy', hit_rate)"
]
},
{
"cell_type": "markdown",
"id": "9dad11b6",
"metadata": {},
"source": [
"# XGBoost"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "3b8a7b51",
"metadata": {},
"outputs": [],
"source": [
"from xgboost import XGBClassifier\n",
"import xgboost as xgb\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.metrics import confusion_matrix, classification_report\n",
"from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score\n",
"from sklearn.utils.class_weight import compute_sample_weight"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "ccef7809",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 5 folds for each of 405 candidates, totalling 2025 fits\n",
"Best hyperparameters: {'colsample_bytree': 0.6, 'gamma': 1, 'max_depth': 4, 'min_child_weight': 5, 'subsample': 0.8}\n",
"Best score: 0.8982530795072788\n"
]
}
],
"source": [
"import numpy as np\n",
"import xgboost as xgb\n",
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"# Base multi-class XGBoost classifier used as the grid-search estimator.\n",
"# `missing` is deliberately left at its default: passing missing=1 would make XGBoost treat\n",
"# every feature value equal to 1 as a missing entry, and the final model fitted after the\n",
"# search is built without it, so the search and the refit must agree.\n",
"clf = xgb.XGBClassifier(objective='multi:softmax', num_class=3)\n",
"\n",
"# Hyperparameters to tune and their candidate values (3 * 5 * 3 * 3 * 3 = 405 combinations).\n",
"param_grid = {\n",
"    'min_child_weight': [1, 5, 10],\n",
"    'gamma': [0.5, 1, 1.5, 2, 5],\n",
"    'subsample': [0.6, 0.8, 1.0],\n",
"    'colsample_bytree': [0.6, 0.8, 1.0],\n",
"    'max_depth': [3, 4, 5]\n",
"}\n",
"\n",
"# Exhaustive search scored by accuracy with 5-fold cross-validation on the training split.\n",
"grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, n_jobs = 2, verbose = 2, scoring='accuracy')\n",
"\n",
"# Fit the grid search object on the training data\n",
"grid_search.fit(x_train, y_train)\n",
"\n",
"# Print the best hyperparameters and their corresponding cross-validated score\n",
"print('Best hyperparameters:', grid_search.best_params_)\n",
"print('Best score:', grid_search.best_score_)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "a21ef45a",
"metadata": {},
"outputs": [],
"source": [
"# Hyperparameter values reported as best by the preceding grid search.\n",
"tuned_params = {\n",
"    'colsample_bytree': 0.6,\n",
"    'gamma': 1,\n",
"    'max_depth': 4,\n",
"    'min_child_weight': 5,\n",
"    'subsample': 0.8,\n",
"}\n",
"# NOTE(review): this rebinds `xgb`, which other cells import as the xgboost module alias.\n",
"xgb = XGBClassifier(**tuned_params)\n",
"xgb_tuned = xgb.fit(x_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "85d2cfa1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.923728813559322"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_pred = xgb_tuned.predict(x_test)\n",
"accuracy_score(y_test, y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "2c078569",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"import xgboost as xgb\n",
"from sklearn.metrics import roc_curve, auc\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Per-class probabilities for the test split; column i is scored against label i below.\n",
"y_prob = xgb_tuned.predict_proba(x_test)\n",
"\n",
"# One-vs-rest ROC curve and AUC for each of the three classes (labels assumed to be 0, 1, 2).\n",
"fpr = {}\n",
"tpr = {}\n",
"roc_auc = {}\n",
"for i in range(3):\n",
"    fpr[i], tpr[i], _ = roc_curve(y_test == i, y_prob[:, i])\n",
"    roc_auc[i] = auc(fpr[i], tpr[i])\n",
"\n",
"# Draw one curve per class; the axes are decorated once after the loop rather than per class.\n",
"plt.figure()\n",
"lw = 2\n",
"colors = ['red', 'blue', 'green']\n",
"for i, color in zip(range(3), colors):\n",
"    plt.plot(fpr[i], tpr[i], color=color, lw=lw,\n",
"             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))\n",
"\n",
"# Chance-level reference diagonal (unlabelled, so it stays out of the legend).\n",
"plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')\n",
"plt.xlim([0.0, 1.0])\n",
"plt.ylim([0.0, 1.05])\n",
"plt.xlabel('False Positive Rate')\n",
"plt.ylabel('True Positive Rate')\n",
"plt.title('ROC curves and AUCs for each class')\n",
"plt.legend(loc=\"lower right\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "5c34f0e0",
"metadata": {},
"source": [
"The results show that the classifier performs well on all three classes, with AUC values of 1.00 for class 0, 0.97 for class 1, and 0.99 for class 2. This indicates that the classifier does an excellent job of distinguishing each class from the other two. The ROC curve for each class is plotted in a different color, and the legend gives the AUC value for each class. The black dashed diagonal represents the ROC curve of a random classifier."
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "dfa19bed",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x432 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Confusion matrix of the current y_pred against the test labels (rows: true class, columns: predicted class).\n",
"cm = confusion_matrix(y_test, y_pred)\n",
"\n",
"# Display names for the three classes, in label order\n",
"classes = ['Healthy', 'Non-cancerous', 'Pancreatic cancer']\n",
"\n",
"# Plot the confusion matrix as a heat map\n",
"plt.figure(figsize=(6, 6))\n",
"plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)\n",
"plt.title('Confusion matrix')\n",
"plt.colorbar()\n",
"tick_marks = np.arange(len(classes))\n",
"plt.xticks(tick_marks, classes, rotation=45)\n",
"plt.yticks(tick_marks, classes)\n",
"# The cells hold integer counts, so print them as integers rather than with two decimals.\n",
"fmt = 'd'\n",
"# Cells darker than half the maximum get white text so the number stays readable.\n",
"thresh = cm.max() / 2.\n",
"for i, j in np.ndindex(cm.shape):\n",
"    plt.text(j, i, format(cm[i, j], fmt),\n",
"             horizontalalignment=\"center\",\n",
"             color=\"white\" if cm[i, j] > thresh else \"black\")\n",
"plt.xlabel('Predicted label')\n",
"plt.ylabel('True label')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "d2d05336",
"metadata": {},
"source": [
"Class 0 (healthy): 37 examples were correctly classified as healthy (true positives for this class), 4 examples were misclassified as non-cancerous pancreas condition, and 0 examples were misclassified as pancreatic cancer. Both kinds of error are false negatives for the healthy class (and false positives for the class that was predicted instead).\n",
"\n",
"Class 1 (non-cancerous pancreas condition): 36 examples were correctly classified as non-cancerous pancreas condition (true positives for this class), 0 examples were misclassified as healthy, and 3 examples were misclassified as pancreatic cancer (false negatives for this class, and false positives for the pancreatic cancer class).\n",
"\n",
"Class 2 (pancreatic cancer): 36 examples were correctly classified as pancreatic cancer (true positives for this class), 2 examples were misclassified as healthy, and 0 examples were misclassified as non-cancerous pancreas condition. The 2 cancers predicted as healthy are false negatives for the cancer class, which is the most costly kind of error in a screening setting."
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "fea7425f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"-------------------- Key Metrics --------------------\n",
"\n",
"Accuracy: 0.92\n",
"Micro Precision: 0.92\n",
"Micro Recall: 0.92\n",
"Micro F1-score: 0.92\n",
"\n",
"Macro Precision: 0.92\n",
"Macro Recall: 0.92\n",
"Macro F1-score: 0.92\n",
"\n",
"Weighted Precision: 0.93\n",
"Weighted Recall: 0.92\n",
"Weighted F1-score: 0.92\n",
"\n",
"--------------- Classification Report ---------------\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 1.00 0.93 0.96 41\n",
" 1 0.88 0.90 0.89 39\n",
" 2 0.90 0.95 0.92 38\n",
"\n",
" accuracy 0.92 118\n",
" macro avg 0.92 0.92 0.92 118\n",
"weighted avg 0.93 0.92 0.92 118\n",
"\n",
"---------------------- XGBoost ----------------------\n"
]
}
],
"source": [
"print('\\n-------------------- Key Metrics --------------------')\n",
"print('\\nAccuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))\n",
"\n",
"# Precision / recall / F1 under each averaging scheme; a blank line separates the groups.\n",
"for averaging in ('micro', 'macro', 'weighted'):\n",
"    heading = averaging.capitalize()\n",
"    group_gap = '' if averaging == 'weighted' else '\\n'\n",
"    print('{} Precision: {:.2f}'.format(heading, precision_score(y_test, y_pred, average=averaging)))\n",
"    print('{} Recall: {:.2f}'.format(heading, recall_score(y_test, y_pred, average=averaging)))\n",
"    print('{} F1-score: {:.2f}{}'.format(heading, f1_score(y_test, y_pred, average=averaging), group_gap))\n",
"\n",
"print('\\n--------------- Classification Report ---------------\\n')\n",
"print(classification_report(y_test, y_pred))\n",
"print('---------------------- XGBoost ----------------------')  # closing banner naming the model"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "73bd98ce",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"from xgboost import plot_importance\n",
"# Plot feature importance of the fitted model.\n",
"# Use xgb_tuned rather than `xgb`: other cells re-import `xgboost as xgb`, so on a top-to-bottom\n",
"# run the name `xgb` refers to the module here, not to the fitted classifier.\n",
"plot_importance(xgb_tuned)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "ef8a1272",
"metadata": {},
"source": [
"# Random Forest"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "941f40b4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-2 {color: black;background-color: white;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),\n",
" param_grid={'bootstrap': [True, False],\n",
" 'criterion': ['gini', 'entropy'],\n",
" 'max_depth': [4, 5, 6, 7, 8],\n",
" 'min_samples_leaf': [1, 2, 4],\n",
" 'min_samples_split': [2, 5, 10],\n",
" 'n_estimators': [100, 200, 500]})</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">GridSearchCV</label><div class=\"sk-toggleable__content\"><pre>GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),\n",
" param_grid={'bootstrap': [True, False],\n",
" 'criterion': ['gini', 'entropy'],\n",
" 'max_depth': [4, 5, 6, 7, 8],\n",
" 'min_samples_leaf': [1, 2, 4],\n",
" 'min_samples_split': [2, 5, 10],\n",
" 'n_estimators': [100, 200, 500]})</pre></div></div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">estimator: RandomForestClassifier</label><div class=\"sk-toggleable__content\"><pre>RandomForestClassifier(random_state=42)</pre></div></div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">RandomForestClassifier</label><div class=\"sk-toggleable__content\"><pre>RandomForestClassifier(random_state=42)</pre></div></div></div></div></div></div></div></div></div></div>"
],
"text/plain": [
"GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),\n",
" param_grid={'bootstrap': [True, False],\n",
" 'criterion': ['gini', 'entropy'],\n",
" 'max_depth': [4, 5, 6, 7, 8],\n",
" 'min_samples_leaf': [1, 2, 4],\n",
" 'min_samples_split': [2, 5, 10],\n",
" 'n_estimators': [100, 200, 500]})"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.metrics import accuracy_score\n",
"\n",
"# Seeded base forest whose hyperparameters are searched below\n",
"clf = RandomForestClassifier(random_state=42)\n",
"\n",
"# Candidate values for every hyperparameter under search\n",
"param_grid = dict(\n",
"    n_estimators=[100, 200, 500],\n",
"    max_depth=[4, 5, 6, 7, 8],\n",
"    min_samples_split=[2, 5, 10],\n",
"    min_samples_leaf=[1, 2, 4],\n",
"    bootstrap=[True, False],\n",
"    criterion=['gini', 'entropy'],\n",
")\n",
"\n",
"# Exhaustive search over the grid with 5-fold cross-validation\n",
"grid_search = GridSearchCV(clf, param_grid, cv=5)\n",
"\n",
"# Run the search on the training split; the fitted search object is the cell's displayed value\n",
"grid_search.fit(x_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "70bea1b3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best hyperparameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}\n",
"Accuracy: 0.8898305084745762\n"
]
}
],
"source": [
"# Winning estimator from the search, together with the settings that produced it\n",
"best_clf = grid_search.best_estimator_\n",
"best_params = grid_search.best_params_\n",
"\n",
"# Score the winning estimator on the held-out test split\n",
"y_pred = best_clf.predict(x_test)\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"\n",
"# Report the chosen settings and the resulting test accuracy\n",
"print(\"Best hyperparameters:\", best_params)\n",
"print(\"Accuracy:\", accuracy)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "a35e5f41",
"metadata": {},
"outputs": [],
"source": [
"# Random forest with hand-picked hyperparameters.\n",
"# NOTE(review): criterion, max_depth and n_estimators differ from the best values the grid\n",
"# search printed (entropy / 8 / 100) - confirm this is intentional.\n",
"rfc = RandomForestClassifier (\n",
"    criterion = 'gini',\n",
"    max_depth = 4,\n",
"    bootstrap = False,\n",
"    min_samples_leaf = 1,\n",
"    min_samples_split = 10,\n",
"    n_estimators = 200,\n",
"    # Seeded like the grid-search estimator so repeated runs give the same forest and metrics.\n",
"    random_state = 42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "f8d6ec61",
"metadata": {},
"outputs": [],
"source": [
"rfc_tuned = rfc.fit(x_train,y_train)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "dbd70d2c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8813559322033898"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_pred = rfc_tuned.predict(x_test)\n",
"accuracy_score(y_test, y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "1c86dbdf",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x432 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Confusion matrix of the current y_pred against the test labels (rows: true class, columns: predicted class).\n",
"cm = confusion_matrix(y_test, y_pred)\n",
"\n",
"# Display names for the three classes, in label order\n",
"classes = ['Healthy', 'Non-cancerous', 'Pancreatic cancer']\n",
"\n",
"# Plot the confusion matrix as a heat map\n",
"plt.figure(figsize=(6, 6))\n",
"plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)\n",
"plt.title('Confusion matrix')\n",
"plt.colorbar()\n",
"tick_marks = np.arange(len(classes))\n",
"plt.xticks(tick_marks, classes, rotation=45)\n",
"plt.yticks(tick_marks, classes)\n",
"# The cells hold integer counts, so print them as integers rather than with two decimals.\n",
"fmt = 'd'\n",
"# Cells darker than half the maximum get white text so the number stays readable.\n",
"thresh = cm.max() / 2.\n",
"for i, j in np.ndindex(cm.shape):\n",
"    plt.text(j, i, format(cm[i, j], fmt),\n",
"             horizontalalignment=\"center\",\n",
"             color=\"white\" if cm[i, j] > thresh else \"black\")\n",
"plt.xlabel('Predicted label')\n",
"plt.ylabel('True label')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "67d1fd8d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"-------------------- Key Metrics --------------------\n",
"\n",
"Accuracy: 0.91\n",
"Micro Precision: 0.91\n",
"Micro Recall: 0.91\n",
"Micro F1-score: 0.91\n",
"\n",
"Macro Precision: 0.91\n",
"Macro Recall: 0.91\n",
"Macro F1-score: 0.91\n",
"\n",
"Weighted Precision: 0.91\n",
"Weighted Recall: 0.91\n",
"Weighted F1-score: 0.91\n",
"\n",
"--------------- Classification Report ---------------\n",
"\n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 0.90 0.94 41\n",
" 1 0.85 0.87 0.86 39\n",
" 2 0.90 0.95 0.92 38\n",
"\n",
" accuracy 0.91 118\n",
" macro avg 0.91 0.91 0.91 118\n",
"weighted avg 0.91 0.91 0.91 118\n",
"\n",
"---------------------- Random Forest ----------------------\n"
]
}
],
"source": [
"print('\\n-------------------- Key Metrics --------------------')\n",
"print('\\nAccuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))\n",
"\n",
"# Precision / recall / F1 under each averaging scheme; a blank line separates the groups.\n",
"for averaging in ('micro', 'macro', 'weighted'):\n",
"    heading = averaging.capitalize()\n",
"    group_gap = '' if averaging == 'weighted' else '\\n'\n",
"    print('{} Precision: {:.2f}'.format(heading, precision_score(y_test, y_pred, average=averaging)))\n",
"    print('{} Recall: {:.2f}'.format(heading, recall_score(y_test, y_pred, average=averaging)))\n",
"    print('{} F1-score: {:.2f}{}'.format(heading, f1_score(y_test, y_pred, average=averaging), group_gap))\n",
"\n",
"print('\\n--------------- Classification Report ---------------\\n')\n",
"print(classification_report(y_test, y_pred))\n",
"print('---------------------- Random Forest ----------------------')  # closing banner naming the model"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "1e432882",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"import xgboost as xgb\n",
"from sklearn.metrics import roc_curve, auc\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Per-class probabilities for the test split; column i is scored against label i below.\n",
"y_prob = rfc_tuned.predict_proba(x_test)\n",
"\n",
"# One-vs-rest ROC curve and AUC for each of the three classes (labels assumed to be 0, 1, 2).\n",
"fpr = {}\n",
"tpr = {}\n",
"roc_auc = {}\n",
"for i in range(3):\n",
"    fpr[i], tpr[i], _ = roc_curve(y_test == i, y_prob[:, i])\n",
"    roc_auc[i] = auc(fpr[i], tpr[i])\n",
"\n",
"# Draw one curve per class; the axes are decorated once after the loop rather than per class.\n",
"plt.figure()\n",
"lw = 2\n",
"colors = ['red', 'blue', 'green']\n",
"for i, color in zip(range(3), colors):\n",
"    plt.plot(fpr[i], tpr[i], color=color, lw=lw,\n",
"             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))\n",
"\n",
"# Chance-level reference diagonal (unlabelled, so it stays out of the legend).\n",
"plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')\n",
"plt.xlim([0.0, 1.0])\n",
"plt.ylim([0.0, 1.05])\n",
"plt.xlabel('False Positive Rate')\n",
"plt.ylabel('True Positive Rate')\n",
"plt.title('ROC curves and AUCs for each class')\n",
"plt.legend(loc=\"lower right\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "c2b10656",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature 'plasma_CA19_9': Importance score 0.023456563152226412\n",
"Feature 'age': Importance score 0.056031320629402935\n",
"Feature 'LYVE1': Importance score 0.06168900062137468\n",
"Feature 'REG1B': Importance score 0.07328207477185975\n",
"Feature 'creatinine': Importance score 0.10190630653592864\n",
"Feature 'TFF1': Importance score 0.24920307196419297\n",
"Feature 'diagnosis': Importance score 0.42972559808785915\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x432 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Importance score of every input feature, as reported by the fitted forest\n",
"feature_importance = rfc_tuned.feature_importances_\n",
"\n",
"# Use the training frame's own column names when it carries them; otherwise fall back to df.columns.\n",
"# NOTE(review): df.columns appears to include non-feature columns (the previous output listed the\n",
"# target 'diagnosis' as a feature), so the fallback can mislabel scores - confirm how x_train is built.\n",
"feature_names = np.asarray(getattr(x_train, 'columns', df.columns))\n",
"\n",
"# Indices of ALL features ordered from most to least important.\n",
"# (The earlier `[1::]` slice kept ascending order and silently dropped the least important feature.)\n",
"sorted_indices = np.argsort(feature_importance)[::-1]\n",
"sorted_feature_importance = feature_importance[sorted_indices]\n",
"\n",
"# Print feature importance scores, most important first\n",
"for feature_idx, importance_score in zip(sorted_indices, sorted_feature_importance):\n",
"    feature_name = feature_names[feature_idx]\n",
"    print(f\"Feature '{feature_name}': Importance score {importance_score}\")\n",
"\n",
"# Visualize the same ranking as a horizontal bar chart\n",
"import matplotlib.pyplot as plt\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"plt.barh(range(len(sorted_feature_importance)), sorted_feature_importance)\n",
"plt.yticks(range(len(sorted_feature_importance)), feature_names[sorted_indices])\n",
"# Put the first (most important) bar at the top of the chart.\n",
"plt.gca().invert_yaxis()\n",
"# Horizontal bars: scores run along x, feature names along y.\n",
"plt.xlabel('Importance Score')\n",
"plt.ylabel('Feature')\n",
"plt.title('Feature Importance')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.8 (tensorflow)",
"language": "python",
"name": "tensorflow"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}