[be1279]: / Dockerfile

Download this file

334 lines (281 with data), 13.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# Build instructions:
#
# For x86_64/amd64 (default):
# docker build -t bionemo .
# # Or explicitly:
# docker build --build-arg TARGETARCH=amd64 -t bionemo .
#
# For ARM64:
# docker build --build-arg TARGETARCH=arm64 -t bionemo .
#
# For multi-platform build:
# docker buildx create --use
# docker buildx build --platform linux/amd64,linux/arm64 -t bionemo .
#
# Base image with apex and transformer engine, but without NeMo or Megatron-LM.
# Note that the core NeMo docker container is defined here:
# https://gitlab-master.nvidia.com/dl/JoC/nemo-ci/-/blob/main/llm_train/Dockerfile.train
# with settings that get defined/injected from this config:
# https://gitlab-master.nvidia.com/dl/JoC/nemo-ci/-/blob/main/.gitlab-ci.yml
# We should keep versions in our container up to date to ensure that we get the latest tested perf improvements and
# training loss curves from NeMo.
# NOTE: declared before the first FROM so it can parameterize the FROM lines below;
# an ARG in this position is only visible to FROM instructions.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.01-py3
FROM rust:1.86.0 AS rust-env
# BUG FIX: TARGETARCH must be (re)declared inside every stage that reads it.
# Without this ARG the variable expands to the empty string, so the arch check
# below silently fell through to the x86_64 branch even on arm64 builds.
# Default matches the amd64 default used by the bionemo2-base stage below.
ARG TARGETARCH=amd64
# Pin the toolchain to 1.82.0 and install only the target needed for the
# requested build architecture (minimal profile keeps the stage small).
RUN rustup set profile minimal && \
rustup install 1.82.0 && \
if [ "$TARGETARCH" = "arm64" ]; then \
rustup target add aarch64-unknown-linux-gnu; \
else \
rustup target add x86_64-unknown-linux-gnu; \
fi && \
rustup default 1.82.0
FROM ${BASE_IMAGE} AS bionemo2-base
# Default to amd64 if no TARGETARCH is specified
ARG TARGETARCH=amd64
# Install core apt packages.
# The apt cache/lib directories are BuildKit cache mounts shared across builds,
# so package indexes are never baked into an image layer and no
# `rm -rf /var/lib/apt/lists/*` cleanup is needed in this RUN.
RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,id=apt-lib,target=/var/lib/apt,sharing=locked \
<<EOF
set -eo pipefail
apt-get update -qy
apt-get install -qyy \
libsndfile1 \
ffmpeg \
git \
curl \
pre-commit \
sudo \
gnupg \
unzip
# Upgrade rsync specifically (security-scan remediation), not the whole system.
apt-get upgrade -qyy \
rsync
rm -rf /tmp/* /var/tmp/*
EOF
# Install the AWS CLI v2 from the official bundle for the build architecture.
# FIX: curl now uses -fsSL so an HTTP error (e.g. 404) fails the build
# immediately instead of saving an error page that `unzip` would later choke
# on with a confusing message; -L follows any redirects on the download URL.
RUN if [ "$TARGETARCH" = "arm64" ]; then \
curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-aarch64.zip" -o "awscliv2.zip"; \
elif [ "$TARGETARCH" = "amd64" ]; then \
curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"; \
else \
echo "Unsupported architecture: $TARGETARCH" && exit 1; \
fi && \
unzip -q awscliv2.zip && \
./aws/install && \
rm -rf aws awscliv2.zip
# Use a branch of causal_conv1d while the repository works on Blackwell support.
# Pinned to an exact commit on the trvachov fork for reproducibility;
# FORCE_BUILD compiles the CUDA extension locally instead of pulling a wheel.
ARG CAUSAL_CONV_TAG=52e06e3d5ca10af0c7eb94a520d768c48ef36f1f
RUN CAUSAL_CONV1D_FORCE_BUILD=TRUE pip --disable-pip-version-check --no-cache-dir install git+https://github.com/trvachov/causal-conv1d.git@${CAUSAL_CONV_TAG}
###############################################################################
# ARM
###############################################################################
# Certain dependencies do not have prebuild ARM wheels/binaries, so we build them
# from source here. Overall, ecosystem ARM support is much weaker than x86, so below
# you'll see some hardcoded patches/versions/experimental branches to get
# everything to work.
# Decord installation (ARM only; amd64 gets the prebuilt pip wheel elsewhere).
# Built from source with a patch for FFmpeg 6 compatibility, CUDA disabled.
# FIX: the apt package lists fetched by `apt-get update` are now removed in the
# same RUN so they are not baked into this layer, and `make` uses all cores.
RUN --mount=type=bind,source=./docker_build_patches/decord_ffmpeg6_fix.patch,target=/decord_ffmpeg6_fix.patch \
if [ "$TARGETARCH" = "arm64" ]; then \
export BUILD_DIR=/build && mkdir ${BUILD_DIR} && cd ${BUILD_DIR} && \
apt-get update && \
apt-get install -y build-essential python3-dev python3-setuptools make cmake && \
apt-get install -y ffmpeg libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev && \
git clone --recursive https://github.com/dmlc/decord && \
cd decord && \
git apply /decord_ffmpeg6_fix.patch && \
mkdir build && cd build && \
cmake .. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release && \
make -j"$(nproc)" && \
cd ../python && \
pip install . && \
cd / && rm -rf ${BUILD_DIR} && \
rm -rf /var/lib/apt/lists/*; \
fi
# TileDB installation (ARM only): fetch the prebuilt native TileDB library,
# then build the TileDB-SOMA Python API against it (TILEDB_PATH is read by the
# TileDB-SOMA build in this same shell). The system libfmt/spdlog packages and
# stray spdlog CMake configs are removed first so the build does not pick up
# incompatible versions.
# FIX: the downloaded tarball and the TileDB-SOMA source clone are now removed
# in the same layer — previously both were left behind, bloating the image.
RUN if [ "$TARGETARCH" = "arm64" ]; then \
mkdir -p /usr/lib/tiledb && \
cd /usr/lib/tiledb && \
wget https://github.com/TileDB-Inc/TileDB/releases/download/2.27.2/tiledb-linux-arm64-2.27.2-1757013.tar.gz -O tiledb.tar.gz && \
tar -xvzf tiledb.tar.gz && \
rm -f tiledb.tar.gz && \
export TILEDB_PATH=/usr/lib/tiledb && \
cd / && \
dpkg -l | awk '/libfmt/ {print $2}' | xargs apt-get remove -y && \
dpkg -l | awk '/spdlog/ {print $2}' | xargs apt-get remove -y && \
rm -f /usr/lib/*/cmake/spdlog/spdlogConfig.cmake && \
rm -f /usr/lib/cmake/spdlog/spdlogConfig.cmake && \
git clone --single-branch --branch 1.16.1 https://github.com/single-cell-data/TileDB-SOMA.git && \
cd TileDB-SOMA/apis/python && \
pip install . && \
cd / && rm -rf TileDB-SOMA; \
fi
# On ARM, bitsandbytes needs to be built from scratch (no prebuilt aarch64 wheel).
# FIX: `pip uninstall` now passes -y; without it pip prompts for confirmation,
# which aborts in the non-interactive docker build shell.
RUN if [ "$TARGETARCH" = "arm64" ]; then \
cd / && pip uninstall -y bitsandbytes && \
git clone --single-branch --branch 0.45.5 https://github.com/bitsandbytes-foundation/bitsandbytes.git && \
cd bitsandbytes && pip install . && cd .. && rm -rf bitsandbytes; \
fi
###############################################################################
# /end ARM
###############################################################################
# Mamba dependency installation, pinned to the v2.2.2 tag; --no-deps because
# its declared dependencies are already provided by the pytorch base image.
RUN pip --disable-pip-version-check --no-cache-dir install \
git+https://github.com/state-spaces/mamba.git@v2.2.2 --no-deps
# Nemo Run installation
# Some things are pip installed in advance to avoid dependency issues during nemo_run installation
RUN pip install hatchling urllib3 # needed to install nemo-run
# NOTE(review): "NEMU" looks like a typo for "NEMO", but the ARG name is part of
# the build interface (--build-arg NEMU_RUN_TAG=...), so renaming would break callers.
ARG NEMU_RUN_TAG=v0.3.0
RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_TAG} --use-deprecated=legacy-resolver
# Rapids SingleCell Installation (unpinned; resolved from NVIDIA's pypi index).
RUN pip install 'rapids-singlecell' --extra-index-url=https://pypi.nvidia.com
RUN mkdir -p /workspace/bionemo2/
WORKDIR /workspace
# Addressing Security Scan Vulnerabilities
RUN rm -rf /opt/pytorch/pytorch/third_party/onnx
# Use UV to install python packages from the workspace. This just installs packages into the system's python
# environment, and does not use the current uv.lock file. Note that with python 3.12, we now need to set
# UV_BREAK_SYSTEM_PACKAGES, since the pytorch base image has made the decision not to use a virtual environment and UV
# does not respect the PIP_BREAK_SYSTEM_PACKAGES environment variable set in the base dockerfile.
COPY --from=ghcr.io/astral-sh/uv:0.6.13 /uv /usr/local/bin/uv
ENV UV_LINK_MODE=copy \
UV_COMPILE_BYTECODE=1 \
UV_PYTHON_DOWNLOADS=never \
UV_SYSTEM_PYTHON=true \
UV_BREAK_SYSTEM_PACKAGES=1
# Install the bionemo-geometric requirements ahead of copying over the rest of the repo, so that we can cache their
# installation. These involve building some torch extensions, so they can take a while to install.
# The requirements file is bind-mounted (not COPY'd) so changing it does not add a layer.
RUN --mount=type=bind,source=./sub-packages/bionemo-geometric/requirements.txt,target=/requirements-pyg.txt \
--mount=type=cache,target=/root/.cache \
uv pip install --no-build-isolation -r /requirements-pyg.txt
# Bring the pinned rust toolchain from the rust-env stage into this image
# (needed to build rust-backed python packages such as bionemo-noodles).
COPY --from=rust-env /usr/local/cargo /usr/local/cargo
COPY --from=rust-env /usr/local/rustup /usr/local/rustup
ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"
WORKDIR /workspace/bionemo2
# Install 3rd-party deps and bionemo submodules.
COPY ./LICENSE /workspace/bionemo2/LICENSE
COPY ./3rdparty /workspace/bionemo2/3rdparty
COPY ./sub-packages /workspace/bionemo2/sub-packages
# Main dependency install. NOTE: the sed edits, install order, and the
# uninstall/reinstall of ngcsdk below are order-sensitive workarounds — do not
# reorder these steps without re-testing the resolver behavior.
RUN --mount=type=bind,source=./requirements-test.txt,target=/requirements-test.txt \
--mount=type=bind,source=./requirements-cve.txt,target=/requirements-cve.txt \
--mount=type=cache,target=/root/.cache <<EOF
set -eo pipefail
# maturin must be present before building the rust-backed sub-packages.
uv pip install maturin --no-build-isolation
# install nvidia-resiliency-ext separately because it doesn't yet have ARM wheels
git clone https://github.com/NVIDIA/nvidia-resiliency-ext
uv pip install nvidia-resiliency-ext/
rm -rf nvidia-resiliency-ext/
# ngcsdk causes strange dependency conflicts (ngcsdk requires protobuf<4, but nemo_toolkit requires protobuf==4.24.4, deleting it from the uv pip install prevents installation conflicts)
sed -i "/ngcsdk/d" ./sub-packages/bionemo-core/pyproject.toml
# Remove llama-index because bionemo doesn't use it and it adds CVEs to container
sed -i "/llama-index/d" ./3rdparty/NeMo/requirements/requirements_nlp.txt
uv pip install --no-build-isolation \
./3rdparty/* \
./sub-packages/bionemo-* \
-r /requirements-cve.txt \
-r /requirements-test.txt
# Install back ngcsdk, as a WAR for the protobuf version conflict with nemo_toolkit.
uv pip install ngcsdk
# Addressing security scan issue - CVE vulnerability https://github.com/advisories/GHSA-g4r7-86gm-pgqc The package is a
# dependency of lm_eval from NeMo requirements_eval.txt. We also remove zstandard, another dependency of lm_eval, which
# seems to be causing issues with NGC downloads. See https://nvbugspro.nvidia.com/bug/5149698
uv pip uninstall sqlitedict zstandard
# Drop build inputs from the layer; the development stage re-COPYs ./3rdparty.
rm -rf ./3rdparty
rm -rf /tmp/*
rm -rf ./sub-packages/bionemo-noodles/target
EOF
# In the devcontainer image, we just copy over the finished `dist-packages` folder from the build image back into the
# base pytorch container. We can then set up a non-root user and uninstall the bionemo and 3rd-party packages, so that
# they can be installed in an editable fashion from the workspace directory. This lets us install all the package
# dependencies in a cached fashion, so they don't have to be built from scratch every time the devcontainer is rebuilt.
FROM ${BASE_IMAGE} AS dev
RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,id=apt-lib,target=/var/lib/apt,sharing=locked \
<<EOF
set -eo pipefail
apt-get update -qy
apt-get install -qyy \
sudo
rm -rf /tmp/* /var/tmp/*
EOF
# Use a non-root user to use inside a devcontainer (with ubuntu 23 and later, we can use the default ubuntu user).
ARG USERNAME=ubuntu
# Grant the devcontainer user passwordless sudo; 0440 is the mode sudoers requires.
RUN echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \
&& chmod 0440 /etc/sudoers.d/$USERNAME
# Here we delete the dist-packages directory from the pytorch base image, and copy over the dist-packages directory from
# the build image. This ensures we have all the necessary dependencies installed (megatron, nemo, etc.).
# NOTE: 777 is deliberate for the devcontainer use case — the non-root user must
# be able to (un)install editable packages into the system python environment.
RUN <<EOF
set -eo pipefail
rm -rf /usr/local/lib/python3.12/dist-packages
mkdir -p /usr/local/lib/python3.12/dist-packages
chmod 777 /usr/local/lib/python3.12/dist-packages
chmod 777 /usr/local/bin
EOF
USER $USERNAME
# Pull the fully-resolved python environment from the build stage, owned by the
# devcontainer user so editable installs work without sudo.
COPY --from=bionemo2-base --chown=$USERNAME:$USERNAME --chmod=777 \
/usr/local/lib/python3.12/dist-packages /usr/local/lib/python3.12/dist-packages
COPY --from=ghcr.io/astral-sh/uv:0.6.13 /uv /usr/local/bin/uv
# UV_COMPILE_BYTECODE=0 here (unlike the build stage) keeps devcontainer
# rebuilds fast; bytecode is generated lazily at import time instead.
ENV UV_LINK_MODE=copy \
UV_COMPILE_BYTECODE=0 \
UV_PYTHON_DOWNLOADS=never \
UV_SYSTEM_PYTHON=true \
UV_BREAK_SYSTEM_PACKAGES=1
# Bring in the rust toolchain, as maturin is a dependency listed in requirements-dev
COPY --from=rust-env /usr/local/cargo /usr/local/cargo
COPY --from=rust-env /usr/local/rustup /usr/local/rustup
ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"
RUN --mount=type=bind,source=./requirements-dev.txt,target=/workspace/bionemo2/requirements-dev.txt \
--mount=type=cache,target=/root/.cache <<EOF
set -eo pipefail
uv pip install -r /workspace/bionemo2/requirements-dev.txt
rm -rf /tmp/*
EOF
# Remove the baked-in bionemo / nemo / megatron installs so they can be
# re-installed editable from the mounted workspace inside the devcontainer.
RUN <<EOF
set -eo pipefail
rm -rf /usr/local/lib/python3.12/dist-packages/bionemo*
pip uninstall -y nemo_toolkit megatron_core
EOF
# Transformer engine attention defaults
# FIXME the following result in unstable training curves even if they are faster
# see https://github.com/NVIDIA/bionemo-framework/pull/421
#ENV NVTE_FUSED_ATTN=1 NVTE_FLASH_ATTN=0
FROM dev AS development
WORKDIR /workspace/bionemo2
COPY --from=bionemo2-base /workspace/bionemo2/ .
COPY ./internal ./internal
# because of the `rm -rf ./3rdparty` in bionemo2-base
COPY ./3rdparty ./3rdparty
USER root
COPY --from=rust-env /usr/local/cargo /usr/local/cargo
COPY --from=rust-env /usr/local/rustup /usr/local/rustup
ENV PATH="/usr/local/cargo/bin:/usr/local/rustup/bin:${PATH}"
ENV RUSTUP_HOME="/usr/local/rustup"
RUN <<EOF
set -eo pipefail
# Drop stale bytecode caches copied in from the build image.
# FIX: -prune -exec {} + replaces `-print | xargs rm -rf`, which broke on
# paths containing whitespace and ran rm even when find matched nothing;
# -prune also stops find descending into directories it is about to delete.
find . -name __pycache__ -type d -prune -exec rm -rf {} +
# Editable installs so source edits in the workspace take effect immediately.
uv pip install --no-build-isolation --editable ./internal/infra-bionemo
for sub in ./3rdparty/* ./sub-packages/bionemo-*; do
uv pip install --no-deps --no-build-isolation --editable $sub
done
EOF
# Since the entire repo is owned by root, switching username for development breaks things.
ARG USERNAME=ubuntu
RUN chown $USERNAME:$USERNAME -R /workspace/bionemo2/
USER $USERNAME
# The 'release' target needs to be last so that it's the default build target. In the future, we could consider a setup
# similar to the devcontainer above, where we copy the dist-packages folder from the build image into the release image.
# This would reduce the overall image size by reducing the number of intermediate layers. In the meantime, we match the
# existing release image build by copying over remaining files from the repo into the container.
FROM bionemo2-base AS release
RUN mkdir -p /workspace/bionemo2/.cache/
# WORKDIR is inherited from bionemo2-base (/workspace/bionemo2), so relative
# COPY destinations below land inside the repo root.
COPY VERSION .
COPY ./scripts ./scripts
COPY ./README.md ./
# Copy over folders so that the image can run tests in a self-contained fashion.
COPY ./ci/scripts ./ci/scripts
COPY ./docs ./docs
COPY --from=rust-env /usr/local/cargo /usr/local/cargo
COPY --from=rust-env /usr/local/rustup /usr/local/rustup
# RUN rm -rf /usr/local/cargo /usr/local/rustup
# NOTE(review): blanket 777 is a security smell; presumably required so
# arbitrary cluster UIDs can write caches/results here — confirm before tightening.
RUN chmod 777 -R /workspace/bionemo2/
# Transformer engine attention defaults
# We have to declare this again because the devcontainer splits from the release image's base.
# FIXME the following results in unstable training curves even if faster.
# See https://github.com/NVIDIA/bionemo-framework/pull/421
# ENV NVTE_FUSED_ATTN=1 NVTE_FLASH_ATTN=0