--- a +++ b/2a-preprocess-pickle-study_to_data.ipynb @@ -0,0 +1,695 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import fastai\n", + "import pickle\n", + "from fastai.utils.mem import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "stage = 'stage_2'\n", + "data_dir = Path('data')\n", + "fn_to_study_ix = pickle.load(open(f'{data_dir}/{stage}_fn_to_study_ix.pickle', 'rb'))\n", + "study_ix_to_fn = pickle.load(open(f'{data_dir}/{stage}_study_ix_to_fn.pickle', 'rb'))\n", + "fn_to_labels = pickle.load( open(f'{data_dir}/{stage}_train_fn_to_labels.pickle', 'rb'))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>fn</th>\n", + " <th>study</th>\n", + " <th>any</th>\n", + " <th>epidural</th>\n", + " <th>intraparenchymal</th>\n", + " <th>intraventricular</th>\n", + " <th>subarachnoid</th>\n", + " <th>subdural</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>ID_76d55d9d0</td>\n", + " <td>ID_0000298a7d</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>ID_96d282ea9</td>\n", + " <td>ID_0000298a7d</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>ID_7d8a7c29d</td>\n", + " <td>ID_0000298a7d</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>ID_4d4401491</td>\n", + " <td>ID_0000298a7d</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>ID_8f5ded0b7</td>\n", + " <td>ID_0000298a7d</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " <td>False</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " fn study any epidural intraparenchymal \\\n", + "0 ID_76d55d9d0 ID_0000298a7d False False False \n", + "1 ID_96d282ea9 ID_0000298a7d False False False \n", + "2 ID_7d8a7c29d ID_0000298a7d False False False \n", + "3 ID_4d4401491 ID_0000298a7d False False False \n", + "4 ID_8f5ded0b7 ID_0000298a7d False False False \n", + "\n", + " intraventricular subarachnoid subdural \n", + "0 False False False \n", + "1 False False False \n", + "2 False False False \n", + "3 False False False \n", + "4 False False False " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classes = ['any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']\n", + "labels = []\n", + "for fn, lbls in fn_to_labels.items():\n", + " row = {'fn':fn,'study':fn_to_study_ix[fn][0]}\n", + " for c in classes:\n", + " row[c] = c in fn_to_labels[fn]\n", + " labels.append(row)\n", + "\n", + "labels_df = pd.DataFrame(labels)\n", + "labels_df.head() " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>any</th>\n", + " <th>epidural</th>\n", + " <th>intraparenchymal</th>\n", + " <th>intraventricular</th>\n", + " <th>subarachnoid</th>\n", + " <th>subdural</th>\n", + " <th>any_size</th>\n", + " <th>epidural_size</th>\n", + " <th>intraparenchymal_size</th>\n", + " <th>intraventricular_size</th>\n", + " <th>subarachnoid_size</th>\n", + " <th>subdural_size</th>\n", + " <th>strat_class</th>\n", + " </tr>\n", + " <tr>\n", + " <th>study</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>ID_0c828b6688</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_fe788cb8e6</th>\n", + " <td>6.0</td>\n", + " <td>0.0</td>\n", + " <td>2.0</td>\n", + " <td>0.0</td>\n", + " <td>4.0</td>\n", + " <td>0.0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>101010.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_676bad1cd8</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_d0df6455dc</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_db74a340fd</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_29ecf20f76</th>\n", + " <td>7.0</td>\n", + " <td>0.0</td>\n", + " <td>5.0</td>\n", + " <td>5.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>101100.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_2d80aed85a</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_d9cf56413e</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_a2d95718f5</th>\n", + " <td>15.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>15.0</td>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>200002.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_8fd4e085bf</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " any epidural intraparenchymal intraventricular \\\n", + "study \n", + "ID_0c828b6688 0.0 0.0 0.0 0.0 \n", + "ID_fe788cb8e6 6.0 0.0 2.0 0.0 \n", + "ID_676bad1cd8 0.0 0.0 0.0 0.0 \n", + "ID_d0df6455dc 0.0 0.0 0.0 0.0 \n", + "ID_db74a340fd 0.0 0.0 0.0 0.0 \n", + "ID_29ecf20f76 7.0 0.0 5.0 5.0 \n", + "ID_2d80aed85a 0.0 0.0 0.0 0.0 \n", + "ID_d9cf56413e 0.0 0.0 0.0 0.0 \n", + "ID_a2d95718f5 15.0 0.0 0.0 0.0 \n", + "ID_8fd4e085bf 0.0 0.0 0.0 0.0 \n", + "\n", + " subarachnoid subdural any_size epidural_size \\\n", + "study \n", + "ID_0c828b6688 0.0 0.0 0 0 \n", + "ID_fe788cb8e6 4.0 0.0 1 0 \n", + "ID_676bad1cd8 0.0 0.0 0 0 \n", + "ID_d0df6455dc 0.0 0.0 0 0 \n", + "ID_db74a340fd 0.0 0.0 0 0 \n", + "ID_29ecf20f76 0.0 0.0 1 0 \n", + "ID_2d80aed85a 0.0 0.0 0 0 \n", + "ID_d9cf56413e 0.0 0.0 0 0 \n", + "ID_a2d95718f5 0.0 15.0 2 0 \n", + "ID_8fd4e085bf 0.0 0.0 0 0 \n", + "\n", + " intraparenchymal_size intraventricular_size \\\n", + "study \n", + "ID_0c828b6688 0 0 \n", + "ID_fe788cb8e6 1 0 \n", + "ID_676bad1cd8 0 0 \n", + "ID_d0df6455dc 0 0 \n", + "ID_db74a340fd 0 0 \n", + "ID_29ecf20f76 1 1 \n", + "ID_2d80aed85a 0 0 \n", + "ID_d9cf56413e 0 0 \n", + "ID_a2d95718f5 0 0 \n", + "ID_8fd4e085bf 0 0 \n", + "\n", + " subarachnoid_size subdural_size strat_class \n", + "study \n", + "ID_0c828b6688 0 0 0.0 \n", + "ID_fe788cb8e6 1 0 101010.0 \n", + "ID_676bad1cd8 0 0 0.0 \n", + "ID_d0df6455dc 0 0 0.0 \n", + "ID_db74a340fd 0 0 0.0 \n", + "ID_29ecf20f76 0 0 101100.0 \n", + "ID_2d80aed85a 0 0 0.0 \n", + "ID_d9cf56413e 0 0 0.0 \n", + "ID_a2d95718f5 0 2 200002.0 \n", + "ID_8fd4e085bf 0 0 0.0 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "study_labels = labels_df.groupby('study').agg('sum').reindex()\n", + "for c in classes :\n", + " study_labels[c+'_size'] = 0 # 0 :not present, 1:small, 2:big\n", + " c_idx = study_labels.query(f'{c}>0').sort_values(c).index\n", + " c_small, c_big = c_idx[:len(c_idx)//2], c_idx[len(c_idx)//2:]\n", + " study_labels.loc[c_small,c+'_size'] = 1\n", + " study_labels.loc[c_big,c+'_size'] = 2\n", + "size_classes = [c+'_size' for c in classes]\n", + "study_labels['strat_class'] = study_labels[size_classes].astype(str).sum(axis=1)\n", + "study_labels.sample(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/antor/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_split.py:667: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=19.\n", + " % (min_groups, self.n_splits)), UserWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>any</th>\n", + " <th>epidural</th>\n", + " <th>intraparenchymal</th>\n", + " <th>intraventricular</th>\n", + " <th>subarachnoid</th>\n", + " <th>subdural</th>\n", + " <th>any_size</th>\n", + " <th>epidural_size</th>\n", + " <th>intraparenchymal_size</th>\n", + " <th>intraventricular_size</th>\n", + " <th>subarachnoid_size</th>\n", + " <th>subdural_size</th>\n", + " <th>strat_class</th>\n", + " <th>fold</th>\n", + " </tr>\n", + " <tr>\n", + " <th>study</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>ID_0000298a7d</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_0004c4e54c</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>15</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_000a935543</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_000f6fd7db</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>9</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_0010b2528e</th>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0.0</td>\n", + " <td>6</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " any epidural intraparenchymal intraventricular \\\n", + "study \n", + "ID_0000298a7d 0.0 0.0 0.0 0.0 \n", + "ID_0004c4e54c 0.0 0.0 0.0 0.0 \n", + "ID_000a935543 0.0 0.0 0.0 0.0 \n", + "ID_000f6fd7db 0.0 0.0 0.0 0.0 \n", + "ID_0010b2528e 0.0 0.0 0.0 0.0 \n", + "\n", + " subarachnoid subdural any_size epidural_size \\\n", + "study \n", + "ID_0000298a7d 0.0 0.0 0 0 \n", + "ID_0004c4e54c 0.0 0.0 0 0 \n", + "ID_000a935543 0.0 0.0 0 0 \n", + "ID_000f6fd7db 0.0 0.0 0 0 \n", + "ID_0010b2528e 0.0 0.0 0 0 \n", + "\n", + " intraparenchymal_size intraventricular_size \\\n", + "study \n", + "ID_0000298a7d 0 0 \n", + "ID_0004c4e54c 0 0 \n", + "ID_000a935543 0 0 \n", + "ID_000f6fd7db 0 0 \n", + "ID_0010b2528e 0 0 \n", + "\n", + " subarachnoid_size subdural_size strat_class fold \n", + "study \n", + "ID_0000298a7d 0 0 0.0 6 \n", + "ID_0004c4e54c 0 0 0.0 15 \n", + "ID_000a935543 0 0 0.0 2 \n", + "ID_000f6fd7db 0 0 0.0 9 \n", + "ID_0010b2528e 0 0 0.0 6 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import StratifiedKFold\n", + "\n", + "#this will throw a warning that can be ignored\n", + "skf = StratifiedKFold(n_splits=19, shuffle=True, random_state=1972)\n", + "study_labels['fold'] = -1\n", + "for fold, (oof_idx,f_idx) in enumerate(skf.split(study_labels, study_labels.strat_class)):\n", + " study_labels.loc[study_labels.iloc[f_idx].index, 'fold' ] = fold\n", + "study_labels.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "study_to_data = {}\n", + "for study in study_ix_to_fn.keys():\n", + " if study in study_labels.index:\n", + " study_to_data[study] = {'fold': study_labels.loc[study].fold}\n", + " else:\n", + " study_to_data[study] = {'fold': -1} #study not in label set\n", + "pickle.dump(study_to_data, open(f\"data/{stage}_study_to_data.pickle\", \"wb\" ))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}