[f1e01c]: /data/tract/baseline.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72552479",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import cv2\n",
    "from PIL import Image\n",
    "import glob\n",
    "from rich import print\n",
    "from ipywidgets import interact\n",
    "from tqdm.auto import tqdm\n",
    "import albumentations as A"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65bea81a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv(\"./train.csv\")\n",
    "df_train = df_train.sort_values([\"id\", \"class\"]).reset_index(drop = True)\n",
    "df_train[\"patient\"] = df_train.id.apply(lambda x: x.split(\"_\")[0])\n",
    "df_train[\"days\"] = df_train.id.apply(lambda x: \"_\".join(x.split(\"_\")[:2]))\n",
    "num_slices = len(np.unique(df_train.id))\n",
    "num_empty_slices = df_train.groupby(\"id\").apply(lambda x: x.segmentation.isna().all()).sum()\n",
    "num_patients = len(np.unique(df_train.patient))\n",
    "num_days = len(np.unique(df_train.days))\n",
    "print({\n",
    "    \"#slices:\": num_slices,\n",
    "    \"#empty slices:\": num_empty_slices,\n",
    "    \"#patients\": num_patients,\n",
    "    \"#days\": num_days\n",
    "})"
   ]
  },
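  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f2a9c10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check: the next cell repeats the per-image metadata three times,\n",
    "# which assumes every id has exactly one row per class (large_bowel, small_bowel, stomach).\n",
    "rows_per_id = df_train.groupby(\"id\").size()\n",
    "assert (rows_per_id == 3).all(), \"expected one row per (id, class)\"\n",
    "assert num_slices * 3 == len(df_train)\n",
    "print(\"rows per id:\", rows_per_id.unique())"
   ]
  },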
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17b870d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_image_files = sorted(glob.glob(\"./train/*/*/scans/*.png\"), key = lambda x: x.split(\"/\")[3] + \"_\" + x.split(\"/\")[5])\n",
    "size_x = [int(os.path.basename(_)[:-4].split(\"_\")[-4]) for _ in all_image_files]\n",
    "size_y = [int(os.path.basename(_)[:-4].split(\"_\")[-3]) for _ in all_image_files]\n",
    "spacing_x = [float(os.path.basename(_)[:-4].split(\"_\")[-2]) for _ in all_image_files]\n",
    "spacing_y = [float(os.path.basename(_)[:-4].split(\"_\")[-1]) for _ in all_image_files]\n",
    "df_train[\"image_files\"] = np.repeat(all_image_files, 3)\n",
    "df_train[\"spacing_x\"] = np.repeat(spacing_x, 3)\n",
    "df_train[\"spacing_y\"] = np.repeat(spacing_y, 3)\n",
    "df_train[\"size_x\"] = np.repeat(size_x, 3)\n",
    "df_train[\"size_y\"] = np.repeat(size_y, 3)\n",
    "df_train[\"slice\"] = np.repeat([int(os.path.basename(_)[:-4].split(\"_\")[-5]) for _ in all_image_files], 3)\n",
    "df_train"
   ]
  },
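  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d3b1e27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional check of the np.repeat alignment above: the sorted file list and the\n",
    "# id-sorted dataframe should walk the slices in the same order, so the case_day\n",
    "# directory of each assigned path must match that row's `days` value.\n",
    "assert len(all_image_files) * 3 == len(df_train)\n",
    "days_from_path = df_train.image_files.apply(lambda x: x.split(\"/\")[3])\n",
    "assert (days_from_path == df_train.days).all()\n",
    "print(\"image/annotation alignment OK\")"
   ]
  },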
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c0681bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(df_train[[\"size_x\", \"size_y\", \"spacing_x\", \"spacing_y\"]].value_counts())\n",
    "\n",
    "norm = lambda x: ((x - x.min()) / np.ptp(x) * 255).astype(np.uint8)\n",
    "colors = {\"large_bowel\": [127, 0, 0], \"small_bowel\": [0, 127, 0], \"stomach\": [0, 0, 127]}\n",
    "\n",
    "def rle_decode(mask_rle, shape):\n",
    "    s = np.array(mask_rle.split(), dtype=int)\n",
    "    starts, lengths = s[0::2] - 1, s[1::2]\n",
    "    ends = starts + lengths\n",
    "    h, w = shape\n",
    "    img = np.zeros((h * w,), dtype = np.uint8)\n",
    "    for lo, hi in zip(starts, ends):\n",
    "        img[lo : hi] = 1\n",
    "    return img.reshape(shape)\n",
    "\n",
    "def rle_encode(img):\n",
    "    pixels = img.flatten()\n",
    "    pixels = np.concatenate([[0], pixels, [0]])\n",
    "    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1\n",
    "    runs[1::2] -= runs[::2]\n",
    "    return ' '.join(str(x) for x in runs)\n",
    "\n",
    "def display_pre(file_name):\n",
    "    img = cv2.imread(file_name, cv2.IMREAD_ANYDEPTH)\n",
    "    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)\n",
    "    img = norm(img)\n",
    "\n",
    "    img_ = img.copy()\n",
    "    segms = df_train.loc[df_train.image_files == file_name]\n",
    "    for segm, label in zip(segms.segmentation, segms[\"class\"]):\n",
    "        if not pd.isna(segm):\n",
    "            mask = rle_decode(segm, img.shape[:2])\n",
    "            img_[mask == 1] = img_[mask == 1] // 2 + colors[label]\n",
    "    img = np.concatenate([img, np.ones((img.shape[0], 10, 3), dtype = np.uint8) * 255, img_], 1)\n",
    "    return img\n",
    "\n",
    "for info, group in df_train.groupby([\"size_x\", \"size_y\", \"spacing_x\", \"spacing_y\"]):\n",
    "    print(info)\n",
    "    file_name = np.random.choice(group.loc[group.slice == 70, \"image_files\"])\n",
    "    img = display_pre(file_name)\n",
    "    display(Image.fromarray(img))"
   ]
  },
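  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c9f0a55",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick round-trip test of the RLE helpers on a few non-empty annotations:\n",
    "# decoding a ground-truth string and re-encoding it should give it back unchanged,\n",
    "# since both helpers work on the flattened row-major mask.\n",
    "sample = df_train.dropna(subset = [\"segmentation\"]).sample(10, random_state = 0)\n",
    "for _, row in sample.iterrows():\n",
    "    img = cv2.imread(row.image_files, cv2.IMREAD_ANYDEPTH)\n",
    "    mask = rle_decode(row.segmentation, img.shape[:2])\n",
    "    assert rle_encode(mask) == row.segmentation\n",
    "print(\"RLE round-trip OK on\", len(sample), \"annotations\")"
   ]
  },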
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b94b98e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def show(idx):\n",
    "    file_name = df_train.loc[idx, \"image_files\"]\n",
    "    img = display_pre(file_name)\n",
    "    display(Image.fromarray(img))\n",
    "\n",
    "@interact\n",
    "def f(idx = (0, len(df_train) - 1, 3)):\n",
    "    show(idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87cb5f53",
   "metadata": {},
   "outputs": [],
   "source": [
    "for day, group in tqdm(df_train.groupby(\"days\")):\n",
    "    patient = group.patient.iloc[0]\n",
    "    imgs = []\n",
    "    msks = []\n",
    "    for file_name in tqdm(group.image_files.unique(), leave = False):\n",
    "        img = cv2.imread(file_name, cv2.IMREAD_ANYDEPTH)\n",
    "        segms = group.loc[group.image_files == file_name]\n",
    "        masks = {}\n",
    "        for segm, label in zip(segms.segmentation, segms[\"class\"]):\n",
    "            if not pd.isna(segm):\n",
    "                mask = rle_decode(segm, img.shape[:2])\n",
    "                masks[label] = mask\n",
    "            else:\n",
    "                masks[label] = np.zeros(img.shape[:2], dtype = np.uint8)\n",
    "        masks = np.stack([masks[k] for k in sorted(masks)], -1)\n",
    "        imgs.append(img)\n",
    "        msks.append(masks)\n",
    "        \n",
    "    imgs = np.stack(imgs, 0)\n",
    "    msks = np.stack(msks, 0)\n",
    "    for i in range(msks.shape[0]):\n",
    "        img = imgs[i]\n",
    "        msk = msks[i]\n",
    "        new_image_name = f\"{day}_{i}.png\"\n",
    "        cv2.imwrite(f\"./mmseg_train/images/{new_image_name}\", img)\n",
    "        cv2.imwrite(f\"./mmseg_train/labels/{new_image_name}\", msk)"
   ]
  },
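  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d41e7b92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spot check of the export: re-read the last written image/label pair; PNG is\n",
    "# lossless and OpenCV reverses the channel order symmetrically on write and read,\n",
    "# so both arrays should come back exactly as written.\n",
    "img_back = cv2.imread(f\"./mmseg_train/images/{new_image_name}\", cv2.IMREAD_ANYDEPTH)\n",
    "msk_back = cv2.imread(f\"./mmseg_train/labels/{new_image_name}\", cv2.IMREAD_UNCHANGED)\n",
    "assert img_back.dtype == img.dtype and (img_back == img).all()\n",
    "assert msk_back.shape == msk.shape and (msk_back == msk).all()\n",
    "print(\"export OK:\", img_back.dtype, img_back.shape, msk_back.shape, np.unique(msk_back))"
   ]
  },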
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3a679e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_image_files = glob.glob(\"./mmseg_train/images/*\")\n",
    "patients = [os.path.basename(_).split(\"_\")[0] for _ in all_image_files]\n",
    "\n",
    "\n",
    "from sklearn.model_selection import GroupKFold\n",
    "\n",
    "split = list(GroupKFold(5).split(patients, groups = patients))\n",
    "\n",
    "for fold, (train_idx, valid_idx) in enumerate(split):\n",
    "    with open(f\"./mmseg_train/splits/fold_{fold}.txt\", \"w\") as f:\n",
    "        for idx in train_idx:\n",
    "            f.write(os.path.basename(all_image_files[idx])[:-4] + \"\\n\")\n",
    "    with open(f\"./mmseg_train/splits/holdout_{fold}.txt\", \"w\") as f:\n",
    "        for idx in valid_idx:\n",
    "            f.write(os.path.basename(all_image_files[idx])[:-4] + \"\\n\")"
   ]
  }
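  ,
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0c57a31",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check the written splits for patient leakage: the first token of every exported\n",
    "# file name is the case id, and GroupKFold should keep each patient on one side only.\n",
    "for fold in range(5):\n",
    "    with open(f\"./mmseg_train/splits/fold_{fold}.txt\") as f:\n",
    "        train_patients = {line.split(\"_\")[0] for line in f if line.strip()}\n",
    "    with open(f\"./mmseg_train/splits/holdout_{fold}.txt\") as f:\n",
    "        holdout_patients = {line.split(\"_\")[0] for line in f if line.strip()}\n",
    "    assert not (train_patients & holdout_patients), f\"patient leakage in fold {fold}\"\n",
    "print(\"folds are patient-disjoint\")"
   ]
  }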
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}