215 lines (214 with data), 7.7 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "72552479",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import cv2\n",
"from PIL import Image\n",
"import glob\n",
"from rich import print\n",
"from ipywidgets import interact\n",
"from tqdm.auto import tqdm\n",
"import albumentations as A"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65bea81a",
"metadata": {},
"outputs": [],
"source": [
"df_train = pd.read_csv(\"./train.csv\")\n",
"df_train = df_train.sort_values([\"id\", \"class\"]).reset_index(drop = True)\n",
"df_train[\"patient\"] = df_train.id.apply(lambda x: x.split(\"_\")[0])\n",
"df_train[\"days\"] = df_train.id.apply(lambda x: \"_\".join(x.split(\"_\")[:2]))\n",
"num_slices = len(np.unique(df_train.id))\n",
"num_empty_slices = df_train.groupby(\"id\").apply(lambda x: x.segmentation.isna().all()).sum()\n",
"num_patients = len(np.unique(df_train.patient))\n",
"num_days = len(np.unique(df_train.days))\n",
"print({\n",
" \"#slices:\": num_slices,\n",
" \"#empty slices:\": num_empty_slices,\n",
" \"#patients\": num_patients,\n",
" \"#days\": num_days\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "17b870d2",
"metadata": {},
"outputs": [],
"source": [
"all_image_files = sorted(glob.glob(\"./train/*/*/scans/*.png\"), key = lambda x: x.split(\"/\")[3] + \"_\" + x.split(\"/\")[5])\n",
"size_x = [int(os.path.basename(_)[:-4].split(\"_\")[-4]) for _ in all_image_files]\n",
"size_y = [int(os.path.basename(_)[:-4].split(\"_\")[-3]) for _ in all_image_files]\n",
"spacing_x = [float(os.path.basename(_)[:-4].split(\"_\")[-2]) for _ in all_image_files]\n",
"spacing_y = [float(os.path.basename(_)[:-4].split(\"_\")[-1]) for _ in all_image_files]\n",
"df_train[\"image_files\"] = np.repeat(all_image_files, 3)\n",
"df_train[\"spacing_x\"] = np.repeat(spacing_x, 3)\n",
"df_train[\"spacing_y\"] = np.repeat(spacing_y, 3)\n",
"df_train[\"size_x\"] = np.repeat(size_x, 3)\n",
"df_train[\"size_y\"] = np.repeat(size_y, 3)\n",
"df_train[\"slice\"] = np.repeat([int(os.path.basename(_)[:-4].split(\"_\")[-5]) for _ in all_image_files], 3)\n",
"df_train"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c0681bc",
"metadata": {},
"outputs": [],
"source": [
"print(df_train[[\"size_x\", \"size_y\", \"spacing_x\", \"spacing_y\"]].value_counts())\n",
"\n",
"norm = lambda x: ((x - x.min()) / np.ptp(x) * 255).astype(np.uint8)\n",
"colors = {\"large_bowel\": [127, 0, 0], \"small_bowel\": [0, 127, 0], \"stomach\": [0, 0, 127]}\n",
"\n",
"def rle_decode(mask_rle, shape):\n",
" s = np.array(mask_rle.split(), dtype=int)\n",
" starts, lengths = s[0::2] - 1, s[1::2]\n",
" ends = starts + lengths\n",
" h, w = shape\n",
" img = np.zeros((h * w,), dtype = np.uint8)\n",
" for lo, hi in zip(starts, ends):\n",
" img[lo : hi] = 1\n",
" return img.reshape(shape)\n",
"\n",
"def rle_encode(img):\n",
" pixels = img.flatten()\n",
" pixels = np.concatenate([[0], pixels, [0]])\n",
" runs = np.where(pixels[1:] != pixels[:-1])[0] + 1\n",
" runs[1::2] -= runs[::2]\n",
" return ' '.join(str(x) for x in runs)\n",
"\n",
"def display_pre(file_name):\n",
" img = cv2.imread(file_name, cv2.IMREAD_ANYDEPTH)\n",
" img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
" img = norm(img)\n",
"\n",
" img_ = img.copy()\n",
" segms = df_train.loc[df_train.image_files == file_name]\n",
" for segm, label in zip(segms.segmentation, segms[\"class\"]):\n",
" if not pd.isna(segm):\n",
" mask = rle_decode(segm, img.shape[:2])\n",
" img_[mask == 1] = img_[mask == 1] // 2 + colors[label]\n",
" img = np.concatenate([img, np.ones((img.shape[0], 10, 3), dtype = np.uint8) * 255, img_], 1)\n",
" return img\n",
"\n",
"for info, group in df_train.groupby([\"size_x\", \"size_y\", \"spacing_x\", \"spacing_y\"]):\n",
" print(info)\n",
" file_name = np.random.choice(group.loc[group.slice == 70, \"image_files\"])\n",
" img = display_pre(file_name)\n",
" display(Image.fromarray(img))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b94b98e4",
"metadata": {},
"outputs": [],
"source": [
"def show(idx):\n",
" file_name = df_train.loc[idx, \"image_files\"]\n",
" img = display_pre(file_name)\n",
" display(Image.fromarray(img))\n",
"\n",
"@interact\n",
"def f(idx = (0, len(df_train) - 1, 3)):\n",
" show(idx)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87cb5f53",
"metadata": {},
"outputs": [],
"source": [
"for day, group in tqdm(df_train.groupby(\"days\")):\n",
" patient = group.patient.iloc[0]\n",
" imgs = []\n",
" msks = []\n",
" for file_name in tqdm(group.image_files.unique(), leave = False):\n",
" img = cv2.imread(file_name, cv2.IMREAD_ANYDEPTH)\n",
" segms = group.loc[group.image_files == file_name]\n",
" masks = {}\n",
" for segm, label in zip(segms.segmentation, segms[\"class\"]):\n",
" if not pd.isna(segm):\n",
" mask = rle_decode(segm, img.shape[:2])\n",
" masks[label] = mask\n",
" else:\n",
" masks[label] = np.zeros(img.shape[:2], dtype = np.uint8)\n",
" masks = np.stack([masks[k] for k in sorted(masks)], -1)\n",
" imgs.append(img)\n",
" msks.append(masks)\n",
" \n",
" imgs = np.stack(imgs, 0)\n",
" msks = np.stack(msks, 0)\n",
" for i in range(msks.shape[0]):\n",
" img = imgs[i]\n",
" msk = msks[i]\n",
" new_image_name = f\"{day}_{i}.png\"\n",
" cv2.imwrite(f\"./mmseg_train/images/{new_image_name}\", img)\n",
" cv2.imwrite(f\"./mmseg_train/labels/{new_image_name}\", msk)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3a679e4",
"metadata": {},
"outputs": [],
"source": [
"all_image_files = glob.glob(\"./mmseg_train/images/*\")\n",
"patients = [os.path.basename(_).split(\"_\")[0] for _ in all_image_files]\n",
"\n",
"\n",
"from sklearn.model_selection import GroupKFold\n",
"\n",
"split = list(GroupKFold(5).split(patients, groups = patients))\n",
"\n",
"for fold, (train_idx, valid_idx) in enumerate(split):\n",
" with open(f\"./mmseg_train/splits/fold_{fold}.txt\", \"w\") as f:\n",
" for idx in train_idx:\n",
" f.write(os.path.basename(all_image_files[idx])[:-4] + \"\\n\")\n",
" with open(f\"./mmseg_train/splits/holdout_{fold}.txt\", \"w\") as f:\n",
" for idx in valid_idx:\n",
" f.write(os.path.basename(all_image_files[idx])[:-4] + \"\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}