Switch to unified view

a b/notebooks/symlinks_subset.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 2,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "import pandas as pd\n",
10
    "import os\n",
11
    "import shutil\n",
12
    "import sys\n",
13
    "import numpy\n",
14
    "import sklearn\n",
15
    "from sklearn.model_selection import train_test_split, cross_val_score\n",
16
    "from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, roc_curve"
17
   ]
18
  },
19
  {
20
   "cell_type": "code",
21
   "execution_count": 3,
22
   "metadata": {},
23
   "outputs": [],
24
   "source": [
25
    "import time\n",
26
    "t = time.time()"
27
   ]
28
  },
29
  {
30
   "cell_type": "code",
31
   "execution_count": 4,
32
   "metadata": {},
33
   "outputs": [
34
    {
35
     "data": {
36
      "text/plain": [
37
       "(83,)"
38
      ]
39
     },
40
     "execution_count": 4,
41
     "metadata": {},
42
     "output_type": "execute_result"
43
    }
44
   ],
45
   "source": [
46
    "infile = \"/repos/tables/glom_xml_split.tab\"\n",
47
    "df = pd.read_table(infile, usecols=[\"file_id\", \"split\"])\n",
48
    "# df['png'] = df.file_id.map(lambda x: x+\".png\")\n",
49
    "df = df.set_index(\"file_id\")[\"split\"]\n",
50
    "df.shape"
51
   ]
52
  },
53
  {
54
   "cell_type": "code",
55
   "execution_count": 5,
56
   "metadata": {},
57
   "outputs": [],
58
   "source": [
59
    "# df.set_index('id', inplace=True)\n"
60
   ]
61
  },
62
  {
63
   "cell_type": "code",
64
   "execution_count": 6,
65
   "metadata": {},
66
   "outputs": [],
67
   "source": [
68
    "# indir = \"/repos/data/glom/data_1024/glom_split/all\"\n",
69
    "# indir = \"/repos/data/glom/data_512_subsample_2x/glom_split/all\"\n",
70
    "indir = \"/repos/data/glom/data_256_subsample_4x/glom_split/all\"\n",
71
    "indir = \"/repos/data/glom/data_128_subsample_8x/glom_split/all\"\n",
72
    "outdir = os.path.dirname(indir.rstrip('/'))\n",
73
    "def datagen(indir):\n",
74
    "    for dd in os.scandir(indir):\n",
75
    "        for ff in os.scandir(dd.path):\n",
76
    "#             if os.path.isdir(ff.path) or not (ff.name.endswith(\"png\") or ff.name.endswith(\"json\")):\n",
77
    "            if os.path.isdir(ff.path) or not (ff.name.endswith(\"json\")):\n",
78
    "                continue\n",
79
    "#             print(ff.name.split('-')[0], ff.path)\n",
80
    "            yield (ff.name.split('-')[0], ff)\n",
81
    "    \n",
82
    "def gen_set(indir, outdir, df):\n",
83
    "    for slideid, ff in datagen(indir):\n",
84
    "        posnegset = os.path.basename(os.path.dirname(ff.path))\n",
85
    "        set_ = df.loc[slideid]\n",
86
    "        yield ff.path, os.path.join(outdir, set_, posnegset, ff.name)"
87
   ]
88
  },
89
  {
90
   "cell_type": "code",
91
   "execution_count": 7,
92
   "metadata": {},
93
   "outputs": [
94
    {
95
     "data": {
96
      "text/plain": [
97
       "11710"
98
      ]
99
     },
100
     "execution_count": 7,
101
     "metadata": {},
102
     "output_type": "execute_result"
103
    }
104
   ],
105
   "source": [
106
    "gen = gen_set(indir, outdir, df)\n",
107
    "sum((1 for _ in gen))"
108
   ]
109
  },
110
  {
111
   "cell_type": "code",
112
   "execution_count": 8,
113
   "metadata": {},
114
   "outputs": [
115
    {
116
     "name": "stdout",
117
     "output_type": "stream",
118
     "text": [
119
      "/repos/data/glom/data_128_subsample_8x/glom_split/train/normal\n",
120
      "/repos/data/glom/data_128_subsample_8x/glom_split/train/glom\n",
121
      "/repos/data/glom/data_128_subsample_8x/glom_split/test/normal\n",
122
      "/repos/data/glom/data_128_subsample_8x/glom_split/test/glom\n",
123
      "/repos/data/glom/data_128_subsample_8x/glom_split/val/normal\n",
124
      "/repos/data/glom/data_128_subsample_8x/glom_split/val/glom\n"
125
     ]
126
    }
127
   ],
128
   "source": [
129
    "posnegset = os.listdir(indir)\n",
130
    "for _, set_ in df.drop_duplicates().items():\n",
131
    "    for pn in posnegset:\n",
132
    "        setdir = os.path.join(outdir, set_, pn)\n",
133
    "        print(setdir)\n",
134
    "        os.makedirs(setdir, exist_ok=True)"
135
   ]
136
  },
137
  {
138
   "cell_type": "code",
139
   "execution_count": 9,
140
   "metadata": {},
141
   "outputs": [
142
    {
143
     "name": "stdout",
144
     "output_type": "stream",
145
     "text": [
146
      "0\n",
147
      "1000\n",
148
      "2000\n",
149
      "3000\n",
150
      "4000\n",
151
      "5000\n",
152
      "6000\n",
153
      "7000\n",
154
      "8000\n",
155
      "9000\n",
156
      "10000\n",
157
      "11000\n"
158
     ]
159
    }
160
   ],
161
   "source": [
162
    "gen = gen_set(indir, outdir, df)\n",
163
    "for nn, (ifn, ofn) in enumerate(gen):\n",
164
    "    try:\n",
165
    "        os.symlink(ifn, ofn)\n",
166
    "    except FileExistsError as ee:\n",
167
    "        print(ee)\n",
168
    "        continue\n",
169
    "    if nn % 1000 == 0:\n",
170
    "        print(nn)"
171
   ]
172
  },
173
  {
174
   "cell_type": "code",
175
   "execution_count": null,
176
   "metadata": {},
177
   "outputs": [],
178
   "source": []
179
  }
180
 ],
181
 "metadata": {
182
  "kernelspec": {
183
   "display_name": "Python 3",
184
   "language": "python",
185
   "name": "python3"
186
  },
187
  "language_info": {
188
   "codemirror_mode": {
189
    "name": "ipython",
190
    "version": 3
191
   },
192
   "file_extension": ".py",
193
   "mimetype": "text/x-python",
194
   "name": "python",
195
   "nbconvert_exporter": "python",
196
   "pygments_lexer": "ipython3",
197
   "version": "3.5.2"
198
  }
199
 },
200
 "nbformat": 4,
201
 "nbformat_minor": 2
202
}