203 lines (202 with data), 5.0 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"import shutil\n",
"import sys\n",
"import numpy\n",
"import sklearn\n",
"from sklearn.model_selection import train_test_split, cross_val_score\n",
"from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, roc_curve"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"t = time.time()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(83,)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Tab-separated table assigning every file_id to a split label.\n",
"infile = \"/repos/tables/glom_xml_split.tab\"\n",
"split_table = pd.read_table(infile, usecols=[\"file_id\", \"split\"])\n",
"# Keep only the `split` column as a Series indexed by file_id (looked up per slide later on).\n",
"df = split_table.set_index(\"file_id\")[\"split\"]\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# df.set_index('id', inplace=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Input root: one sub-directory per label (e.g. glom / normal). Exactly one resolution is active;\n",
"# the alternatives stay commented out so the active dataset is unambiguous.\n",
"# indir = \"/repos/data/glom/data_1024/glom_split/all\"\n",
"# indir = \"/repos/data/glom/data_512_subsample_2x/glom_split/all\"\n",
"# indir = \"/repos/data/glom/data_256_subsample_4x/glom_split/all\"\n",
"indir = \"/repos/data/glom/data_128_subsample_8x/glom_split/all\"\n",
"# The per-split trees are written next to `all`, i.e. into its parent directory.\n",
"outdir = os.path.dirname(indir.rstrip('/'))\n",
"def datagen(indir, suffixes=(\"json\",)):\n",
"    \"\"\"Yield ``(slide_id, entry)`` for each matching file one level below ``indir``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    indir : str\n",
"        Directory whose sub-directories are scanned (non-recursively).\n",
"    suffixes : str or iterable of str, optional\n",
"        File-name endings to keep. Defaults to ``(\"json\",)``; pass e.g.\n",
"        ``(\"png\", \"json\")`` to include images as well.\n",
"\n",
"    Yields\n",
"    ------\n",
"    tuple\n",
"        ``(slide_id, entry)`` where ``slide_id`` is the part of the file name\n",
"        before the first ``'-'`` and ``entry`` is the ``os.DirEntry`` of the file.\n",
"    \"\"\"\n",
"    if isinstance(suffixes, str):\n",
"        suffixes = (suffixes,)\n",
"    suffixes = tuple(suffixes)\n",
"    for dd in os.scandir(indir):\n",
"        # A stray file next to the label directories would make scandir() raise.\n",
"        if not dd.is_dir():\n",
"            continue\n",
"        for ff in os.scandir(dd.path):\n",
"            # Only plain files with a wanted suffix; nested directories are ignored.\n",
"            if ff.is_dir() or not ff.name.endswith(suffixes):\n",
"                continue\n",
"            yield (ff.name.split('-')[0], ff)\n",
"\n",
"def gen_set(indir, outdir, df):\n",
"    \"\"\"Yield ``(source_path, target_path)`` for every file produced by ``datagen(indir)``.\n",
"\n",
"    The target is ``outdir/<split>/<label>/<file name>``, where ``<split>`` is\n",
"    ``df.loc[slide_id]`` and ``<label>`` is the name of the directory that\n",
"    currently contains the file.\n",
"    \"\"\"\n",
"    for slide_id, entry in datagen(indir):\n",
"        label_dir = os.path.basename(os.path.dirname(entry.path))\n",
"        split_name = df.loc[slide_id]\n",
"        target = os.path.join(outdir, split_name, label_dir, entry.name)\n",
"        yield entry.path, target"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"11710"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Dry run: exhaust the generator once to count the files that would be linked.\n",
"gen = gen_set(indir, outdir, df)\n",
"sum(1 for _ in gen)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/repos/data/glom/data_128_subsample_8x/glom_split/train/normal\n",
"/repos/data/glom/data_128_subsample_8x/glom_split/train/glom\n",
"/repos/data/glom/data_128_subsample_8x/glom_split/test/normal\n",
"/repos/data/glom/data_128_subsample_8x/glom_split/test/glom\n",
"/repos/data/glom/data_128_subsample_8x/glom_split/val/normal\n",
"/repos/data/glom/data_128_subsample_8x/glom_split/val/glom\n"
]
}
],
"source": [
"# Pre-create <outdir>/<split>/<label> for every split in the table and every entry of `indir`.\n",
"posnegset = os.listdir(indir)\n",
"for set_ in df.unique():\n",
"    for pn in posnegset:\n",
"        setdir = os.path.join(outdir, set_, pn)\n",
"        print(setdir)\n",
"        os.makedirs(setdir, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"1000\n",
"2000\n",
"3000\n",
"4000\n",
"5000\n",
"6000\n",
"7000\n",
"8000\n",
"9000\n",
"10000\n",
"11000\n"
]
}
],
"source": [
"# Link every source file into its <split>/<label> directory.\n",
"# Safe to re-run: a link that already points at the same source is counted quietly,\n",
"# while any other pre-existing path is reported and left untouched.\n",
"gen = gen_set(indir, outdir, df)\n",
"n_existing = 0\n",
"for nn, (ifn, ofn) in enumerate(gen):\n",
"    # Progress is reported before the attempt so skipped files do not hide it.\n",
"    if nn % 1000 == 0:\n",
"        print(nn)\n",
"    try:\n",
"        os.symlink(ifn, ofn)\n",
"    except FileExistsError as ee:\n",
"        if os.path.islink(ofn) and os.readlink(ofn) == ifn:\n",
"            n_existing += 1\n",
"        else:\n",
"            print(ee)\n",
"if n_existing:\n",
"    print(\"already linked:\", n_existing)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}