572 lines (572 with data), 20.7 kB
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"machine_shape": "hm",
"gpuType": "A100",
"authorship_tag": "ABX9TyPO4RCR90x1ulCT1NL/OQC2",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/cwinsor/medical_image_uw_madison/blob/main/colab_02_train.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"import sys\n",
"import os"
],
"metadata": {
"id": "kxzfyBT42MJu"
},
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "PzixNp9Sr_BQ"
},
"outputs": [],
"source": [
"assert ('google.colab' in sys.modules), \"ERROR - the script expects to be run in Colab\""
]
},
{
"cell_type": "code",
"source": [
"import torch\n",
"torch.cuda.is_available()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yaDETX2Svhx3",
"outputId": "4b77ed08-f201-40a6-f560-9178b3ac81e7"
},
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"source": [
"# we persist code, dataset and runs on google drive...\n",
"from google.colab import drive\n",
"drive.mount('/content/gdrive')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "kyNdABfE1ut3",
"outputId": "83705a59-1b3f-4ac0-e0ea-00fece31292e"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/gdrive\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# We probably don't want to run a real \"git fetch\" here, but \"git fetch --dry-run\" is a helpful check for upstream repo updates.\n",
"# Versions were already set by the prior script (colab_01),\n",
"# so even if they are out of date we wouldn't update them here; we'd go\n",
"# back to that earlier script instead.\n",
"\n",
"work_folder = '/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/'\n",
"os.chdir(work_folder)\n",
"print(\"checking git status (git fetch --dry-run):\")\n",
"!pwd\n",
"!git fetch --dry-run"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MwCx0MLb2a5w",
"outputId": "b6b73b7c-e4f2-4206-f741-24ff1e0df7b6"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"checking git status (git fetch --dry-run):\n",
"/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison\n",
"From https://github.com/cwinsor/medical_image_uw_madison\n",
" b19876f..c3ca8f4 main -> origin/main\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Install dependencies"
],
"metadata": {
"id": "BmGKp3qJXpSK"
}
},
{
"cell_type": "code",
"source": [
"!apt update\n",
"# !apt install python3-venv python3-pip"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MMGJ7zfCbieS",
"outputId": "5cbe455d-4a86-4415-d8fa-2112fb46b0fb"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease\n",
"Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]\n",
"Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease\n",
"Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]\n",
"Hit:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n",
"Get:6 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [731 kB]\n",
"Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [108 kB]\n",
"Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease\n",
"Get:9 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [734 kB]\n",
"Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n",
"Fetched 1,802 kB in 3s (711 kB/s)\n",
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"8 packages can be upgraded. Run 'apt list --upgradable' to see them.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"pip --version"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "dsC-cl5cYL-R",
"outputId": "8d05fae4-ebe2-4ca5-9ede-50d9461a688b"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip --version"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "c4y3IzR_p-MY",
"outputId": "6fca2792-46d0-4f27-931f-6d329144232a"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"run_folder = '/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT'\n",
"os.chdir(run_folder)\n",
"!pwd"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ennX8bLUPthE",
"outputId": "76e9cad1-8028-430a-f844-f4b89766f7ea"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip list > tempfile3a.txt"
],
"metadata": {
"id": "ZSJkGJdcc1A7"
},
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!pip install -r ../requirements_cw1b.txt # NOTE this is \"cw\" requirements one level up"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "kkEeD-HvXVvA",
"outputId": "658a8590-4dbd-4123-febb-8c5328ece21f"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting mmcv==1.3.1 (from -r ../requirements_cw1b.txt (line 1))\n",
" Using cached mmcv-1.3.1-py2.py3-none-any.whl\n",
"Collecting mmcv-full==1.3.1 (from -r ../requirements_cw1b.txt (line 2))\n",
" Using cached mmcv-full-1.3.1.tar.gz (259 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Collecting addict (from mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1))\n",
" Using cached addict-2.4.0-py3-none-any.whl (3.8 kB)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (1.22.4)\n",
"Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (8.4.0)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (6.0.1)\n",
"Collecting yapf (from mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1))\n",
" Using cached yapf-0.40.1-py3-none-any.whl (250 kB)\n",
"Collecting importlib-metadata>=6.6.0 (from yapf->mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1))\n",
" Using cached importlib_metadata-6.8.0-py3-none-any.whl (22 kB)\n",
"Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (3.9.1)\n",
"Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (2.0.1)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (3.16.2)\n",
"Building wheels for collected packages: mmcv-full\n",
" \u001b[1;31merror\u001b[0m: \u001b[1msubprocess-exited-with-error\u001b[0m\n",
" \n",
" \u001b[31m×\u001b[0m \u001b[32mpython setup.py bdist_wheel\u001b[0m did not run successfully.\n",
" \u001b[31m│\u001b[0m exit code: \u001b[1;36m1\u001b[0m\n",
" \u001b[31m╰─>\u001b[0m See above for output.\n",
" \n",
" \u001b[1;35mnote\u001b[0m: This error originates from a subprocess, and is likely not a problem with pip.\n",
" Building wheel for mmcv-full (setup.py) ... \u001b[?25lerror\n",
"\u001b[31m ERROR: Failed building wheel for mmcv-full\u001b[0m\u001b[31m\n",
"\u001b[0m\u001b[?25h Running setup.py clean for mmcv-full\n",
"Failed to build mmcv-full\n",
"\u001b[31mERROR: Could not build wheels for mmcv-full, which is required to install pyproject.toml-based projects\u001b[0m\u001b[31m\n",
"\u001b[0m"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip list > tempfile3b.txt"
],
"metadata": {
"id": "j-LHhjEEdBj7"
},
"execution_count": 12,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Add to PATH (if required)"
],
"metadata": {
"id": "jHGYYc_kWGZd"
}
},
{
"cell_type": "markdown",
"source": [
"## Comparing ***os.getenv('PATH')*** to ***sys.path***\n",
"* The former is the search path the shell uses to locate executables (applications).\n",
"* The latter is Python's search path for modules. Importantly, it is constructed when Python *IS STARTED* from:\n",
" * the directory containing the script being run (or the current directory for an interactive session)\n",
" * installation-dependent standard folders\n",
" * the contents of PYTHONPATH\n",
" \n",
"In other words, updating PYTHONPATH from inside an already-running Jupyter kernel won't change where `import` looks; modify `sys.path` directly instead.\n"
],
"metadata": {
"id": "U-cUpb3JgiiG"
}
},
{
"cell_type": "code",
"source": [
"THE_FOLDER = \"foo\"\n",
"# THE_FOLDER = os.getcwd()\n",
"\n",
"if THE_FOLDER in sys.path:\n",
" print(\"already exists\")\n",
"else:\n",
" sys.path.append(THE_FOLDER)\n",
" print(\"added\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VuvTVyhISc2E",
"outputId": "3dbf9c20-cc39-46e7-9971-45d1e8eac458"
},
"execution_count": 29,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"already exists\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# PyTorch Distributed...\n",
"\n",
"https://pytorch.org/docs/stable/distributed.html\n",
"\n",
" How to use this module:\n",
"\n",
" Single-Node multi-process distributed training\n",
"\n",
" python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE\n",
" YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other\n",
" arguments of your training script)\n",
"\n",
" Multi-Node multi-process distributed training: (e.g. two nodes)\n",
"\n",
" Node 1: (IP: 192.168.1.1, and has a free port: 1234)\n",
" python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE\n",
" --nnodes=2 --node-rank=0 --master-addr=\"192.168.1.1\"\n",
" --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3\n",
" and all other arguments of your training script)\n",
"\n",
" Node 2:\n",
" python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE\n",
" --nnodes=2 --node-rank=1 --master-addr=\"192.168.1.1\"\n",
" --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3\n",
" and all other arguments of your training script)\n"
],
"metadata": {
"id": "T56srWVM7CGk"
}
},
{
"cell_type": "markdown",
"source": [
"# Ours is single-node multi-GPU.\n",
"So the shell command would be:\n",
"\n",
" cd /content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT\n",
"\n",
" CONFIG='work_configs/tract/final_solution/classification_configs/cls_1.py'\n",
" NUM_GPUS_PER_NODE=2\n",
" MASTER_PORT=1234\n",
" ARGS='foo=\"bar\" lala=\"foofoo\"'\n",
"\n",
" python -m torch.distributed.launch --nproc-per-node=$NUM_GPUS_PER_NODE\n",
" tools/train.py $CONFIG --launcher pytorch $ARGS\n",
" "
],
"metadata": {
"id": "i7gS7KopA5h3"
}
},
{
"cell_type": "markdown",
"source": [
"# let's do that now..."
],
"metadata": {
"id": "enb2V0FeGHb9"
}
},
{
"cell_type": "code",
"source": [
"run_folder = '/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT'\n",
"os.chdir(run_folder)\n",
"!pwd"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "NqdJQ1P3GJ9o",
"outputId": "4318969f-143a-4182-cd11-8ec5a7c3e518"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"CONFIG='work_configs/tract/final_solution/classification_configs/cls_1.py'\n",
"NUM_GPUS_PER_NODE=1\n",
"MASTER_PORT=1234\n",
"ARGS='foo=\"bar\" lala=\"foofoo\"'"
],
"metadata": {
"id": "fGPgtudrJrhl"
},
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!python -m torch.distributed.launch --nproc-per-node=$NUM_GPUS_PER_NODE tools/train.py $CONFIG --launcher pytorch $ARGS"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VcmbJhb5KssT",
"outputId": "a125ee0a-367e-4e54-aa06-75fa1acb4315"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/usr/local/lib/python3.10/dist-packages/torch/distributed/launch.py:181: FutureWarning: The module torch.distributed.launch is deprecated\n",
"and will be removed in future. Use torchrun.\n",
"Note that --use-env is set by default in torchrun.\n",
"If your script expects `--local-rank` argument to be set, please\n",
"change it to read from `os.environ['LOCAL_RANK']` instead. See \n",
"https://pytorch.org/docs/stable/distributed.html#launch-utility for \n",
"further instructions\n",
"\n",
" warnings.warn(\n",
"No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n",
"Traceback (most recent call last):\n",
" File \"/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT/tools/train.py\", line 15, in <module>\n",
" from mmseg import __version__\n",
"ModuleNotFoundError: No module named 'mmseg'\n",
"ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 9800) of binary: /usr/bin/python3\n",
"Traceback (most recent call last):\n",
" File \"/usr/lib/python3.10/runpy.py\", line 196, in _run_module_as_main\n",
" return _run_code(code, main_globals, None,\n",
" File \"/usr/lib/python3.10/runpy.py\", line 86, in _run_code\n",
" exec(code, run_globals)\n",
" File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launch.py\", line 196, in <module>\n",
" main()\n",
" File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launch.py\", line 192, in main\n",
" launch(args)\n",
" File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launch.py\", line 177, in launch\n",
" run(args)\n",
" File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py\", line 785, in run\n",
" elastic_launch(\n",
" File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py\", line 134, in __call__\n",
" return launch_agent(self._config, self._entrypoint, list(args))\n",
" File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py\", line 250, in launch_agent\n",
" raise ChildFailedError(\n",
"torch.distributed.elastic.multiprocessing.errors.ChildFailedError: \n",
"============================================================\n",
"tools/train.py FAILED\n",
"------------------------------------------------------------\n",
"Failures:\n",
" <NO_OTHER_FAILURES>\n",
"------------------------------------------------------------\n",
"Root Cause (first observed failure):\n",
"[0]:\n",
" time : 2023-07-21_22:27:53\n",
" host : f6a49488322e\n",
" rank : 0 (local_rank: 0)\n",
" exitcode : 1 (pid: 9800)\n",
" error_file: <N/A>\n",
" traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html\n",
"============================================================\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "BGLApDEGK_v3"
},
"execution_count": null,
"outputs": []
}
]
}