584 lines (583 with data), 15.1 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read in your label data:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>rad1</th>\n",
" <th>rad2</th>\n",
" <th>rad3</th>\n",
" <th>biopsy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>cancer</td>\n",
" <td>benign</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>cancer</td>\n",
" <td>cancer</td>\n",
" <td>cancer</td>\n",
" <td>cancer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>cancer</td>\n",
" <td>cancer</td>\n",
" <td>benign</td>\n",
" <td>cancer</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>benign</td>\n",
" <td>benign</td>\n",
" <td>cancer</td>\n",
" <td>benign</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" rad1 rad2 rad3 biopsy\n",
"0 benign benign benign benign\n",
"1 benign benign benign benign\n",
"2 benign benign benign benign\n",
"3 benign benign benign benign\n",
"4 benign benign cancer benign\n",
"5 cancer cancer cancer cancer\n",
"6 benign benign benign benign\n",
"7 benign benign benign benign\n",
"8 cancer cancer benign cancer\n",
"9 benign benign cancer benign"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels = pd.read_csv('labels.csv')\n",
"labels.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create your first ground truth as derived from biopsy labels: "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>rad1</th>\n",
" <th>rad2</th>\n",
" <th>rad3</th>\n",
" <th>biopsy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" rad1 rad2 rad3 biopsy\n",
"0 1 1 1 1\n",
"1 1 1 1 1\n",
"2 1 1 1 1\n",
"3 1 1 1 1\n",
"4 1 1 0 1\n",
"5 0 0 0 0\n",
"6 1 1 1 1\n",
"7 1 1 1 1\n",
"8 0 0 1 0\n",
"9 1 1 0 1"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## I'm going to replace everything in my 'labels' dataframe with 0's and 1's for easier processing later:\n",
"labels2 = labels.replace('benign',1).replace('cancer',0)\n",
"labels2.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
"Name: biopsy, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gt1 = labels2['biopsy']\n",
"gt1.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create your second truth by voting system from the three radiologists:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 1.0\n",
"2 1.0\n",
"3 1.0\n",
"4 1.0\n",
"dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gt2 = labels2[['rad1','rad2','rad3']].sum(axis=1)\n",
"gt2 = (gt2 > 1).replace(True,1).replace(False,0)\n",
"gt2.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create your third ground truth by weighting the three radiologists:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>rad1</th>\n",
" <th>rad2</th>\n",
" <th>rad3</th>\n",
" <th>biopsy</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.33</td>\n",
" <td>0.67</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.33</td>\n",
" <td>0.67</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.33</td>\n",
" <td>0.67</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.33</td>\n",
" <td>0.67</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.33</td>\n",
" <td>0.67</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" rad1 rad2 rad3 biopsy\n",
"0 0.33 0.67 1 1\n",
"1 0.33 0.67 1 1\n",
"2 0.33 0.67 1 1\n",
"3 0.33 0.67 1 1\n",
"4 0.33 0.67 0 1"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"weighted_labels = labels2.copy()\n",
"weighted_labels['rad2'] = weighted_labels['rad2'] * 0.67\n",
"weighted_labels['rad1'] = weighted_labels['rad1'] * 0.33\n",
"weighted_labels.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 1.0\n",
"2 1.0\n",
"3 1.0\n",
"4 0.0\n",
"dtype: float64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gt3 = weighted_labels[['rad1','rad2','rad3']].sum(axis=1)\n",
"gt3 = (gt3 > 1).replace(True,1).replace(False,0)\n",
"gt3.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compare the three ground truths:\n",
"\n",
"Here, just explore the three sets of labels you created and see how often they agree"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12 False\n",
"14 False\n",
"22 False\n",
"29 False\n",
"30 False\n",
"34 False\n",
"37 False\n",
"52 False\n",
"57 False\n",
"dtype: bool"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"biopsy_to_votes = gt1 == gt2\n",
"biopsy_to_votes[biopsy_to_votes==False]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4 False\n",
"9 False\n",
"12 False\n",
"14 False\n",
"17 False\n",
"20 False\n",
"22 False\n",
"29 False\n",
"30 False\n",
"34 False\n",
"37 False\n",
"52 False\n",
"56 False\n",
"57 False\n",
"58 False\n",
"dtype: bool"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"biopsy_to_weights = gt1 == gt3\n",
"biopsy_to_weights[biopsy_to_weights==False]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Interestingly, in the example above the weighting example performs worse against biopsy labels than simple voting. This may be an artefact of the weightings that we chose, and is not always sub-optimal to simple voting. "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}