Switch to side-by-side view

--- a
+++ b/notebooks/symlinks_subset.ipynb
@@ -0,0 +1,202 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import os\n",
+    "import shutil\n",
+    "import sys\n",
+    "import numpy\n",
+    "import sklearn\n",
+    "from sklearn.model_selection import train_test_split, cross_val_score\n",
+    "from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, roc_curve"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "t = time.time()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(83,)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "infile = \"/repos/tables/glom_xml_split.tab\"\n",
+    "df = pd.read_table(infile, usecols=[\"file_id\", \"split\"])\n",
+    "# df['png'] = df.file_id.map(lambda x: x+\".png\")\n",
+    "df = df.set_index(\"file_id\")[\"split\"]\n",
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df.set_index('id', inplace=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# indir = \"/repos/data/glom/data_1024/glom_split/all\"\n",
+    "# indir = \"/repos/data/glom/data_512_subsample_2x/glom_split/all\"\n",
+    "indir = \"/repos/data/glom/data_256_subsample_4x/glom_split/all\"\n",
+    "indir = \"/repos/data/glom/data_128_subsample_8x/glom_split/all\"\n",
+    "outdir = os.path.dirname(indir.rstrip('/'))\n",
+    "def datagen(indir):\n",
+    "    for dd in os.scandir(indir):\n",
+    "        for ff in os.scandir(dd.path):\n",
+    "#             if os.path.isdir(ff.path) or not (ff.name.endswith(\"png\") or ff.name.endswith(\"json\")):\n",
+    "            if os.path.isdir(ff.path) or not (ff.name.endswith(\"json\")):\n",
+    "                continue\n",
+    "#             print(ff.name.split('-')[0], ff.path)\n",
+    "            yield (ff.name.split('-')[0], ff)\n",
+    "    \n",
+    "def gen_set(indir, outdir, df):\n",
+    "    for slideid, ff in datagen(indir):\n",
+    "        posnegset = os.path.basename(os.path.dirname(ff.path))\n",
+    "        set_ = df.loc[slideid]\n",
+    "        yield ff.path, os.path.join(outdir, set_, posnegset, ff.name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "11710"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gen = gen_set(indir, outdir, df)\n",
+    "sum((1 for _ in gen))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/repos/data/glom/data_128_subsample_8x/glom_split/train/normal\n",
+      "/repos/data/glom/data_128_subsample_8x/glom_split/train/glom\n",
+      "/repos/data/glom/data_128_subsample_8x/glom_split/test/normal\n",
+      "/repos/data/glom/data_128_subsample_8x/glom_split/test/glom\n",
+      "/repos/data/glom/data_128_subsample_8x/glom_split/val/normal\n",
+      "/repos/data/glom/data_128_subsample_8x/glom_split/val/glom\n"
+     ]
+    }
+   ],
+   "source": [
+    "posnegset = os.listdir(indir)\n",
+    "for _, set_ in df.drop_duplicates().items():\n",
+    "    for pn in posnegset:\n",
+    "        setdir = os.path.join(outdir, set_, pn)\n",
+    "        print(setdir)\n",
+    "        os.makedirs(setdir, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0\n",
+      "1000\n",
+      "2000\n",
+      "3000\n",
+      "4000\n",
+      "5000\n",
+      "6000\n",
+      "7000\n",
+      "8000\n",
+      "9000\n",
+      "10000\n",
+      "11000\n"
+     ]
+    }
+   ],
+   "source": [
+    "gen = gen_set(indir, outdir, df)\n",
+    "for nn, (ifn, ofn) in enumerate(gen):\n",
+    "    try:\n",
+    "        os.symlink(ifn, ofn)\n",
+    "    except FileExistsError as ee:\n",
+    "        print(ee)\n",
+    "        continue\n",
+    "    if nn % 1000 == 0:\n",
+    "        print(nn)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}