# %% [markdown]
# # Symlink a tile set into per-split directories
#
# Tiles live under `<root>/all/<class>/`.  Every `*json` file found there is
# symlinked to `<root>/<split>/<class>/<name>`, where `<split>` is looked up in
# a per-slide table (columns `file_id`, `split`) using the part of the file
# name before the first `-` as the slide id.

# %%
import os
import shutil
import sys
import time

import numpy
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, roc_curve

# %%
# Wall-clock start of the run; evaluate `time.time() - t` to get elapsed time.
t = time.time()

# %%
# Load the slide -> split assignment as a Series indexed by `file_id`.
infile = "/repos/tables/glom_xml_split.tab"
df = pd.read_table(infile, usecols=["file_id", "split"])
df = df.set_index("file_id")["split"]
df.shape

# %%
# Pick exactly one input resolution.  The alternatives are kept as comments so
# that switching is a one-line change instead of a silently overwritten
# assignment.
# indir = "/repos/data/glom/data_1024/glom_split/all"
# indir = "/repos/data/glom/data_512_subsample_2x/glom_split/all"
# indir = "/repos/data/glom/data_256_subsample_4x/glom_split/all"
indir = "/repos/data/glom/data_128_subsample_8x/glom_split/all"

# The per-split directories are created next to `all`, i.e. in its parent.
outdir = os.path.dirname(indir.rstrip('/'))


def datagen(indir):
    """Yield ``(slide_id, entry)`` for every JSON file one level below *indir*.

    Parameters
    ----------
    indir : str
        Directory whose immediate sub-directories contain the files.

    Yields
    ------
    tuple of (str, os.DirEntry)
        ``slide_id`` is the part of the file name before the first ``'-'``;
        ``entry`` is the ``os.scandir`` entry of the file itself.

    Notes
    -----
    Entries of *indir* that are not directories are skipped; scanning them
    would raise ``NotADirectoryError``.  Inside each sub-directory, nested
    directories and names that do not end in ``"json"`` are ignored.
    """
    for dd in os.scandir(indir):
        if not dd.is_dir():
            continue
        for ff in os.scandir(dd.path):
            if ff.is_dir() or not ff.name.endswith("json"):
                continue
            yield (ff.name.split('-')[0], ff)


def gen_set(indir, outdir, df):
    """Yield ``(source_path, link_path)`` pairs for every file from `datagen`.

    Parameters
    ----------
    indir : str
        Directory passed on to `datagen`.
    outdir : str
        Root under which ``<split>/<class>/<file name>`` link paths are built.
    df : pandas.Series
        Split name per slide, indexed by slide id.

    Yields
    ------
    tuple of (str, str)
        Path of the existing file and the path where its symlink belongs.

    Raises
    ------
    KeyError
        If a slide id taken from a file name is not in ``df``; the message
        names the offending file.
    """
    for slideid, ff in datagen(indir):
        # The class name is the directory that directly contains the file.
        posnegset = os.path.basename(os.path.dirname(ff.path))
        try:
            set_ = df.loc[slideid]
        except KeyError:
            raise KeyError(
                "slide id {!r} (from {}) has no entry in the split table".format(
                    slideid, ff.path
                )
            )
        yield ff.path, os.path.join(outdir, set_, posnegset, ff.name)


# %%
# Dry run: count how many links will be created (nothing is written yet).
gen = gen_set(indir, outdir, df)
sum(1 for _ in gen)

# %%
# Create one output directory per (split, class) pair.  Only sub-directories
# of `indir` count as classes, matching what `datagen` scans.
posnegset = [entry.name for entry in os.scandir(indir) if entry.is_dir()]
for set_ in df.unique():
    for pn in posnegset:
        setdir = os.path.join(outdir, set_, pn)
        print(setdir)
        os.makedirs(setdir, exist_ok=True)

# %%
# Create the symlinks.  Re-running is safe: links that already point at the
# right source are only counted, while any other name clash is printed.
gen = gen_set(indir, outdir, df)
n_created = 0
n_existing = 0
for nn, (ifn, ofn) in enumerate(gen):
    # Report progress before the attempt so a skipped file cannot hide it.
    if nn % 1000 == 0:
        print(nn)
    try:
        os.symlink(ifn, ofn)
    except FileExistsError as ee:
        if os.path.islink(ofn) and os.readlink(ofn) == ifn:
            n_existing += 1
        else:
            print(ee)
    else:
        n_created += 1
print("created {} symlinks, {} already in place".format(n_created, n_existing))