{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Read in your label data:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rad1rad2rad3biopsy
0benignbenignbenignbenign
1benignbenignbenignbenign
2benignbenignbenignbenign
3benignbenignbenignbenign
4benignbenigncancerbenign
5cancercancercancercancer
6benignbenignbenignbenign
7benignbenignbenignbenign
8cancercancerbenigncancer
9benignbenigncancerbenign
\n", "
" ], "text/plain": [ " rad1 rad2 rad3 biopsy\n", "0 benign benign benign benign\n", "1 benign benign benign benign\n", "2 benign benign benign benign\n", "3 benign benign benign benign\n", "4 benign benign cancer benign\n", "5 cancer cancer cancer cancer\n", "6 benign benign benign benign\n", "7 benign benign benign benign\n", "8 cancer cancer benign cancer\n", "9 benign benign cancer benign" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "labels = pd.read_csv('labels.csv')\n", "labels.head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create your first ground truth as derived from biopsy labels: " ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rad1rad2rad3biopsy
01111
11111
21111
31111
41101
50000
61111
71111
80010
91101
\n", "
" ], "text/plain": [ " rad1 rad2 rad3 biopsy\n", "0 1 1 1 1\n", "1 1 1 1 1\n", "2 1 1 1 1\n", "3 1 1 1 1\n", "4 1 1 0 1\n", "5 0 0 0 0\n", "6 1 1 1 1\n", "7 1 1 1 1\n", "8 0 0 1 0\n", "9 1 1 0 1" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## I'm going to replace everything in my 'labels' dataframe with 0's and 1's for easier processing later:\n", "labels2 = labels.replace('benign',1).replace('cancer',0)\n", "labels2.head(10)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1\n", "1 1\n", "2 1\n", "3 1\n", "4 1\n", "Name: biopsy, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gt1 = labels2['biopsy']\n", "gt1.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create your second truth by voting system from the three radiologists:" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 1.0\n", "2 1.0\n", "3 1.0\n", "4 1.0\n", "dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gt2 = labels2[['rad1','rad2','rad3']].sum(axis=1)\n", "gt2 = (gt2 > 1).replace(True,1).replace(False,0)\n", "gt2.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create your third ground truth by weighting the three radiologists:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rad1rad2rad3biopsy
00.330.6711
10.330.6711
20.330.6711
30.330.6711
40.330.6701
\n", "
" ], "text/plain": [ " rad1 rad2 rad3 biopsy\n", "0 0.33 0.67 1 1\n", "1 0.33 0.67 1 1\n", "2 0.33 0.67 1 1\n", "3 0.33 0.67 1 1\n", "4 0.33 0.67 0 1" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "weighted_labels = labels2.copy()\n", "weighted_labels['rad2'] = weighted_labels['rad2'] * 0.67\n", "weighted_labels['rad1'] = weighted_labels['rad1'] * 0.33\n", "weighted_labels.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 1.0\n", "2 1.0\n", "3 1.0\n", "4 0.0\n", "dtype: float64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gt3 = weighted_labels[['rad1','rad2','rad3']].sum(axis=1)\n", "gt3 = (gt3 > 1).replace(True,1).replace(False,0)\n", "gt3.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Compare the three ground truths:\n", "\n", "Here, just explore the three sets of labels you created and see how often they agree" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "12 False\n", "14 False\n", "22 False\n", "29 False\n", "30 False\n", "34 False\n", "37 False\n", "52 False\n", "57 False\n", "dtype: bool" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "biopsy_to_votes = gt1 == gt2\n", "biopsy_to_votes[biopsy_to_votes==False]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4 False\n", "9 False\n", "12 False\n", "14 False\n", "17 False\n", "20 False\n", "22 False\n", "29 False\n", "30 False\n", "34 False\n", "37 False\n", "52 False\n", "56 False\n", "57 False\n", "58 False\n", "dtype: bool" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "biopsy_to_weights = gt1 == gt3\n", "biopsy_to_weights[biopsy_to_weights==False]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Interestingly, in the example above the weighting example performs worse against biopsy labels than simple voting. This may be an artefact of the weightings that we chose, and is not always sub-optimal to simple voting. " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 2 }