[44382a]: / colab_02_train.ipynb

Download this file

572 lines (572 with data), 20.7 kB

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "machine_shape": "hm",
      "gpuType": "A100",
      "authorship_tag": "ABX9TyPO4RCR90x1ulCT1NL/OQC2",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/cwinsor/medical_image_uw_madison/blob/main/colab_02_train.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import sys\n",
        "import os"
      ],
      "metadata": {
        "id": "kxzfyBT42MJu"
      },
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "PzixNp9Sr_BQ"
      },
      "outputs": [],
      "source": [
        "assert ('google.colab' in sys.modules), \"ERROR - the script expects to be run in Colab\""
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "torch.cuda.is_available()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "yaDETX2Svhx3",
        "outputId": "4b77ed08-f201-40a6-f560-9178b3ac81e7"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {},
          "execution_count": 3
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# we persist code, dataset and runs on google drive...\n",
        "from google.colab import drive\n",
        "drive.mount('/content/gdrive')"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "kyNdABfE1ut3",
        "outputId": "83705a59-1b3f-4ac0-e0ea-00fece31292e"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/gdrive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# we probably don't want to do a \"git fetch\" here, but a \"git fetch --dry-run\" may be helpful to check whether there are repo updates\n",
        "# versions are already set from the prior script colab_01 ...\n",
        "# so even if they are out-of-date we wouldn't want to update them here, we'd go\n",
        "# back to the earlier script...\n",
        "\n",
        "work_folder = '/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/'\n",
        "os.chdir(work_folder)\n",
        "print(\"checking git status (git fetch --dry-run):\")\n",
        "!pwd\n",
        "!git fetch --dry-run"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MwCx0MLb2a5w",
        "outputId": "b6b73b7c-e4f2-4206-f741-24ff1e0df7b6"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "checking git status (git fetch --dry-run):\n",
            "/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison\n",
            "From https://github.com/cwinsor/medical_image_uw_madison\n",
            "   b19876f..c3ca8f4  main       -> origin/main\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Install dependencies"
      ],
      "metadata": {
        "id": "BmGKp3qJXpSK"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!apt update\n",
        "# !apt install python3-venv python3-pip"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MMGJ7zfCbieS",
        "outputId": "5cbe455d-4a86-4415-d8fa-2112fb46b0fb"
      },
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease\n",
            "Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]\n",
            "Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease\n",
            "Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]\n",
            "Hit:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n",
            "Get:6 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [731 kB]\n",
            "Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [108 kB]\n",
            "Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease\n",
            "Get:9 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [734 kB]\n",
            "Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n",
            "Fetched 1,802 kB in 3s (711 kB/s)\n",
            "Reading package lists... Done\n",
            "Building dependency tree... Done\n",
            "Reading state information... Done\n",
            "8 packages can be upgraded. Run 'apt list --upgradable' to see them.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "pip --version"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dsC-cl5cYL-R",
        "outputId": "8d05fae4-ebe2-4ca5-9ede-50d9461a688b"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip --version"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "c4y3IzR_p-MY",
        "outputId": "6fca2792-46d0-4f27-931f-6d329144232a"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "pip 23.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "run_folder = '/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT'\n",
        "os.chdir(run_folder)\n",
        "!pwd"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ennX8bLUPthE",
        "outputId": "76e9cad1-8028-430a-f844-f4b89766f7ea"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip list > tempfile3a.txt"
      ],
      "metadata": {
        "id": "ZSJkGJdcc1A7"
      },
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install -r ../requirements_cw1b.txt  # NOTE this is \"cw\" requirements one level up"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "kkEeD-HvXVvA",
        "outputId": "658a8590-4dbd-4123-febb-8c5328ece21f"
      },
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting mmcv==1.3.1 (from -r ../requirements_cw1b.txt (line 1))\n",
            "  Using cached mmcv-1.3.1-py2.py3-none-any.whl\n",
            "Collecting mmcv-full==1.3.1 (from -r ../requirements_cw1b.txt (line 2))\n",
            "  Using cached mmcv-full-1.3.1.tar.gz (259 kB)\n",
            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "Collecting addict (from mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1))\n",
            "  Using cached addict-2.4.0-py3-none-any.whl (3.8 kB)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (1.22.4)\n",
            "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (8.4.0)\n",
            "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (6.0.1)\n",
            "Collecting yapf (from mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1))\n",
            "  Using cached yapf-0.40.1-py3-none-any.whl (250 kB)\n",
            "Collecting importlib-metadata>=6.6.0 (from yapf->mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1))\n",
            "  Using cached importlib_metadata-6.8.0-py3-none-any.whl (22 kB)\n",
            "Requirement already satisfied: platformdirs>=3.5.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (3.9.1)\n",
            "Requirement already satisfied: tomli>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from yapf->mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (2.0.1)\n",
            "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata>=6.6.0->yapf->mmcv==1.3.1->-r ../requirements_cw1b.txt (line 1)) (3.16.2)\n",
            "Building wheels for collected packages: mmcv-full\n",
            "  \u001b[1;31merror\u001b[0m: \u001b[1msubprocess-exited-with-error\u001b[0m\n",
            "  \n",
            "  \u001b[31m×\u001b[0m \u001b[32mpython setup.py bdist_wheel\u001b[0m did not run successfully.\n",
            "  \u001b[31m│\u001b[0m exit code: \u001b[1;36m1\u001b[0m\n",
            "  \u001b[31m╰─>\u001b[0m See above for output.\n",
            "  \n",
            "  \u001b[1;35mnote\u001b[0m: This error originates from a subprocess, and is likely not a problem with pip.\n",
            "  Building wheel for mmcv-full (setup.py) ... \u001b[?25lerror\n",
            "\u001b[31m  ERROR: Failed building wheel for mmcv-full\u001b[0m\u001b[31m\n",
            "\u001b[0m\u001b[?25h  Running setup.py clean for mmcv-full\n",
            "Failed to build mmcv-full\n",
            "\u001b[31mERROR: Could not build wheels for mmcv-full, which is required to install pyproject.toml-based projects\u001b[0m\u001b[31m\n",
            "\u001b[0m"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip list > tempfile3b.txt"
      ],
      "metadata": {
        "id": "j-LHhjEEdBj7"
      },
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Add to PATH (if required)"
      ],
      "metadata": {
        "id": "jHGYYc_kWGZd"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Comparing ***os.getenv('PATH')*** to ***sys.path***\n",
        "* the former is the search path the environment shell uses to find executables (applications)\n",
        "* the latter is the Python search path for modules. Importantly, it is constructed when Python *IS STARTED* and consists of:\n",
        "  * the folder containing the script being run (or the current directory for an interactive session)\n",
        "  * installation-dependent standard folders\n",
        "  * the contents of PYTHONPATH\n",
        "  \n",
        "In other words... updating PYTHONPATH after the kernel has started won't change `sys.path` inside a Jupyter notebook! Append the folder to `sys.path` directly instead.\n"
      ],
      "metadata": {
        "id": "U-cUpb3JgiiG"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "THE_FOLDER = \"foo\"\n",
        "# THE_FOLDER = os.getcwd()\n",
        "\n",
        "if THE_FOLDER in sys.path:\n",
        "    print(\"already exists\")\n",
        "else:\n",
        "    sys.path.append(THE_FOLDER)\n",
        "    print(\"added\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VuvTVyhISc2E",
        "outputId": "3dbf9c20-cc39-46e7-9971-45d1e8eac458"
      },
      "execution_count": 29,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "already exists\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# PyTorch Distributed...\n",
        "\n",
        "https://pytorch.org/docs/stable/distributed.html\n",
        "\n",
        "    How to use this module:\n",
        "\n",
        "    Single-Node multi-process distributed training\n",
        "\n",
        "    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE\n",
        "            YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other\n",
        "            arguments of your training script)\n",
        "\n",
        "    Multi-Node multi-process distributed training: (e.g. two nodes)\n",
        "\n",
        "    Node 1: (IP: 192.168.1.1, and has a free port: 1234)\n",
        "    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE\n",
        "            --nnodes=2 --node-rank=0 --master-addr=\"192.168.1.1\"\n",
        "            --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3\n",
        "            and all other arguments of your training script)\n",
        "\n",
        "    Node 2:\n",
        "    python -m torch.distributed.launch --nproc-per-node=NUM_GPUS_YOU_HAVE\n",
        "            --nnodes=2 --node-rank=1 --master-addr=\"192.168.1.1\"\n",
        "            --master-port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3\n",
        "            and all other arguments of your training script)\n"
      ],
      "metadata": {
        "id": "T56srWVM7CGk"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Ours is single-node multi-GPU.\n",
        "So the shell command would be:\n",
        "\n",
        "    cd /content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT\n",
        "\n",
        "    CONFIG='work_configs/tract/final_solution/classification_configs/cls_1.py'\n",
        "    NUM_GPUS_PER_NODE=2\n",
        "    MASTER_PORT=1234\n",
        "    ARGS='foo=\"bar\" lala=\"foofoo\"'\n",
        "\n",
        "    python -m torch.distributed.launch --nproc-per-node=$NUM_GPUS_PER_NODE\n",
        "            tools/train.py $CONFIG --launcher pytorch $ARGS\n",
        "            "
      ],
      "metadata": {
        "id": "i7gS7KopA5h3"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "# let's do that now..."
      ],
      "metadata": {
        "id": "enb2V0FeGHb9"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "run_folder = '/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT'\n",
        "os.chdir(run_folder)\n",
        "!pwd"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "NqdJQ1P3GJ9o",
        "outputId": "4318969f-143a-4182-cd11-8ec5a7c3e518"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "CONFIG='work_configs/tract/final_solution/classification_configs/cls_1.py'\n",
        "NUM_GPUS_PER_NODE=1\n",
        "MASTER_PORT=1234\n",
        "ARGS='foo=\"bar\" lala=\"foofoo\"'"
      ],
      "metadata": {
        "id": "fGPgtudrJrhl"
      },
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!python -m torch.distributed.launch --nproc-per-node=$NUM_GPUS_PER_NODE tools/train.py $CONFIG --launcher pytorch $ARGS"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VcmbJhb5KssT",
        "outputId": "a125ee0a-367e-4e54-aa06-75fa1acb4315"
      },
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/torch/distributed/launch.py:181: FutureWarning: The module torch.distributed.launch is deprecated\n",
            "and will be removed in future. Use torchrun.\n",
            "Note that --use-env is set by default in torchrun.\n",
            "If your script expects `--local-rank` argument to be set, please\n",
            "change it to read from `os.environ['LOCAL_RANK']` instead. See \n",
            "https://pytorch.org/docs/stable/distributed.html#launch-utility for \n",
            "further instructions\n",
            "\n",
            "  warnings.warn(\n",
            "No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n",
            "Traceback (most recent call last):\n",
            "  File \"/content/gdrive/MyDrive/Colab_UW_Madison/medical_image_uw_madison/project/Kaggle-UWMGIT/tools/train.py\", line 15, in <module>\n",
            "    from mmseg import __version__\n",
            "ModuleNotFoundError: No module named 'mmseg'\n",
            "ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 9800) of binary: /usr/bin/python3\n",
            "Traceback (most recent call last):\n",
            "  File \"/usr/lib/python3.10/runpy.py\", line 196, in _run_module_as_main\n",
            "    return _run_code(code, main_globals, None,\n",
            "  File \"/usr/lib/python3.10/runpy.py\", line 86, in _run_code\n",
            "    exec(code, run_globals)\n",
            "  File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launch.py\", line 196, in <module>\n",
            "    main()\n",
            "  File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launch.py\", line 192, in main\n",
            "    launch(args)\n",
            "  File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launch.py\", line 177, in launch\n",
            "    run(args)\n",
            "  File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py\", line 785, in run\n",
            "    elastic_launch(\n",
            "  File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py\", line 134, in __call__\n",
            "    return launch_agent(self._config, self._entrypoint, list(args))\n",
            "  File \"/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py\", line 250, in launch_agent\n",
            "    raise ChildFailedError(\n",
            "torch.distributed.elastic.multiprocessing.errors.ChildFailedError: \n",
            "============================================================\n",
            "tools/train.py FAILED\n",
            "------------------------------------------------------------\n",
            "Failures:\n",
            "  <NO_OTHER_FAILURES>\n",
            "------------------------------------------------------------\n",
            "Root Cause (first observed failure):\n",
            "[0]:\n",
            "  time      : 2023-07-21_22:27:53\n",
            "  host      : f6a49488322e\n",
            "  rank      : 0 (local_rank: 0)\n",
            "  exitcode  : 1 (pid: 9800)\n",
            "  error_file: <N/A>\n",
            "  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html\n",
            "============================================================\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "BGLApDEGK_v3"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}