--- a +++ b/1. Applying AI to 2D Medical Imaging Data/8. Obtaining a Gold Standard Exercise/solution.ipynb @@ -0,0 +1,583 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read in your label data:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>rad1</th>\n", + " <th>rad2</th>\n", + " <th>rad3</th>\n", + " <th>biopsy</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>cancer</td>\n", + " <td>benign</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>cancer</td>\n", + " <td>cancer</td>\n", + " <td>cancer</td>\n", + " <td>cancer</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>benign</td>\n", + " 
<td>benign</td>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>cancer</td>\n", + " <td>cancer</td>\n", + " <td>benign</td>\n", + " <td>cancer</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>benign</td>\n", + " <td>benign</td>\n", + " <td>cancer</td>\n", + " <td>benign</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " rad1 rad2 rad3 biopsy\n", + "0 benign benign benign benign\n", + "1 benign benign benign benign\n", + "2 benign benign benign benign\n", + "3 benign benign benign benign\n", + "4 benign benign cancer benign\n", + "5 cancer cancer cancer cancer\n", + "6 benign benign benign benign\n", + "7 benign benign benign benign\n", + "8 cancer cancer benign cancer\n", + "9 benign benign cancer benign" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "labels = pd.read_csv('labels.csv')\n", + "labels.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create your first ground truth as derived from biopsy labels: " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>rad1</th>\n", + " <th>rad2</th>\n", + " <th>rad3</th>\n", + " <th>biopsy</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + 
" <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " rad1 rad2 rad3 biopsy\n", + "0 1 1 1 1\n", + "1 1 1 1 1\n", + "2 1 1 1 1\n", + "3 1 1 1 1\n", + "4 1 1 0 1\n", + "5 0 0 0 0\n", + "6 1 1 1 1\n", + "7 1 1 1 1\n", + "8 0 0 1 0\n", + "9 1 1 0 1" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## I'm going to replace everything in my 'labels' dataframe with 0's and 1's for easier processing later:\n", + "labels2 = labels.replace('benign',1).replace('cancer',0)\n", + "labels2.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 1\n", + "2 1\n", + "3 1\n", + "4 1\n", + "Name: biopsy, dtype: int64" + ] + }, + "execution_count": 
5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gt1 = labels2['biopsy']\n", + "gt1.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create your second truth by voting system from the three radiologists:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1.0\n", + "1 1.0\n", + "2 1.0\n", + "3 1.0\n", + "4 1.0\n", + "dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gt2 = labels2[['rad1','rad2','rad3']].sum(axis=1)\n", + "gt2 = (gt2 > 1).replace(True,1).replace(False,0)\n", + "gt2.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create your third ground truth by weighting the three radiologists:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>rad1</th>\n", + " <th>rad2</th>\n", + " <th>rad3</th>\n", + " <th>biopsy</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0.33</td>\n", + " <td>0.67</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.33</td>\n", + " <td>0.67</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.33</td>\n", + " <td>0.67</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " 
<td>0.33</td>\n", + " <td>0.67</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0.33</td>\n", + " <td>0.67</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " rad1 rad2 rad3 biopsy\n", + "0 0.33 0.67 1 1\n", + "1 0.33 0.67 1 1\n", + "2 0.33 0.67 1 1\n", + "3 0.33 0.67 1 1\n", + "4 0.33 0.67 0 1" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weighted_labels = labels2.copy()\n", + "weighted_labels['rad2'] = weighted_labels['rad2'] * 0.67\n", + "weighted_labels['rad1'] = weighted_labels['rad1'] * 0.33\n", + "weighted_labels.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1.0\n", + "1 1.0\n", + "2 1.0\n", + "3 1.0\n", + "4 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gt3 = weighted_labels[['rad1','rad2','rad3']].sum(axis=1)\n", + "gt3 = (gt3 > 1).replace(True,1).replace(False,0)\n", + "gt3.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare the three ground truths:\n", + "\n", + "Here, just explore the three sets of labels you created and see how often they agree" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12 False\n", + "14 False\n", + "22 False\n", + "29 False\n", + "30 False\n", + "34 False\n", + "37 False\n", + "52 False\n", + "57 False\n", + "dtype: bool" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "biopsy_to_votes = gt1 == gt2\n", + "biopsy_to_votes[biopsy_to_votes==False]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": 
[ + "4 False\n", + "9 False\n", + "12 False\n", + "14 False\n", + "17 False\n", + "20 False\n", + "22 False\n", + "29 False\n", + "30 False\n", + "34 False\n", + "37 False\n", + "52 False\n", + "56 False\n", + "57 False\n", + "58 False\n", + "dtype: bool" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "biopsy_to_weights = gt1 == gt3\n", + "biopsy_to_weights[biopsy_to_weights==False]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Interestingly, in the example above the weighted ground truth disagrees with the biopsy labels more often than the simple majority vote does (15 cases versus 9). This may be an artefact of the particular weights we chose; a weighted scheme is not always worse than simple voting." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}