Update dockerfile and add docker build action (#3283)

jkulhanek · ginazhouhuiwu · brentyi · web-flow · commit f86dbe6d455a · 2024-09-05T10:12:21.000+02:00
* Use github docker registry

* Fix DDP train for GPU in exclusive mode

* Improve docker image - compile gsplat, decrease image size

* Drop unrelated change

* Add build docker image action

* Rename build docker image action

* nit

* Remove commented line from Dockerfile

* Fix dockerfile when explicit source is specified

* Fix failing dynamo build for torch.compile

* Lock dockerfile and tcnn versions

* Add `torch.cuda.is_available()` condition

* Drop set_cuda_device

* Fix build docker image github action

* Docker build save disk space

* Set MAX_JOBS to limit resource usage for docker build

* Try bumping `MAX_JOBS` 2 => 4

* Install fixed gsplat version from nerfstudio's pyproject.toml

* Update docs

* Finish docker build action

* Fix ignores push when PR

---------

Co-authored-by: Gina Wu <ginawu98@gmail.com>
Co-authored-by: Brent Yi <yibrenth@gmail.com>
diff --git a/.github/workflows/build_docker_image.yml b/.github/workflows/build_docker_image.yml
@@ -0,0 +1,62 @@
+# Builds the repository Dockerfile on every trigger below. The image is pushed
+# to the container registry for every event except pull requests, and a build
+# provenance attestation is generated for the pushed image.
+name: Build Docker Image
+on:
+  workflow_dispatch:
+  workflow_call:
+  pull_request:
+  push:
+    branches:
+      - main
+      - master
+    tags:
+      - 'v*'
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+jobs:
+  build-and-publish-docker-image:
+    runs-on: ubuntu-latest
+    name: build-and-publish-docker-image
+    permissions:
+      packages: write
+      contents: read
+      attestations: write
+      id-token: write
+    steps:
+      - uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          # Use the shared REGISTRY value so login, tagging and attestation
+          # always target the same registry.
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          # Branch and PR builds are tagged by ref; `v*` tags additionally
+          # produce `<version>` and `<major>.<minor>` tags.
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+      # NOTE(review): this action is referenced by the moving `main` branch while
+      # the docker/* actions in this job are pinned to commit SHAs - consider
+      # pinning it the same way.
+      - name: Free root space
+        uses: almahmoud/free-root-space@main
+        with:
+          remove-gcc: false
+          remove-cplusplus: false
+      - name: Build and push Docker image
+        id: push
+        uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
+        with:
+          context: .
+          file: ./Dockerfile
+          # Pull requests only verify that the image builds; nothing is published.
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+      - name: Generate artifact attestation
+        # Only attest images that were actually pushed. Pull-request builds
+        # publish nothing, and pull requests from forks do not receive the
+        # write permissions (id-token, attestations) this action needs.
+        if: github.event_name != 'pull_request'
+        uses: actions/attest-build-provenance@v1
+        with:
+          subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          subject-digest: ${{ steps.push.outputs.digest }}
+          push-to-registry: ${{ github.event_name != 'pull_request' }}
+
diff --git a/Dockerfile b/Dockerfile
@@ -1,185 +1,133 @@
-ARG CUDA_VERSION=11.8.0
-ARG OS_VERSION=22.04
-# Define base image.
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${OS_VERSION}
-ARG CUDA_VERSION
-ARG OS_VERSION
+# syntax=docker/dockerfile:1
+# Global build arguments. They are declared before the first FROM, so every
+# stage that needs one re-declares it with a bare ARG line.
+ARG UBUNTU_VERSION=22.04
+ARG NVIDIA_CUDA_VERSION=11.8.0
+# CUDA architectures, required by Colmap and tiny-cuda-nn. Use >= 8.0 for faster TCNN.
+ARG CUDA_ARCHITECTURES="90;89;86;80;75;70;61"
+# Git branch or tag of nerfstudio to clone. Leave empty (the default) to build
+# from the local build context instead.
+ARG NERFSTUDIO_VERSION=""
+
+# Pull source either provided or from git.
+# Two candidate stages are defined and exactly one is selected further down:
+#   source_copy    - copies the build context to /tmp/nerfstudio
+#   source_no_copy - clones NERFSTUDIO_VERSION from GitHub to /tmp/nerfstudio
+# Both use ONBUILD, so the COPY / git clone is executed in the stage that is
+# built FROM them rather than in these stages themselves.
+FROM scratch AS source_copy
+ONBUILD COPY . /tmp/nerfstudio
+FROM alpine/git AS source_no_copy
+ARG NERFSTUDIO_VERSION
+ONBUILD RUN git clone --branch ${NERFSTUDIO_VERSION} --recursive https://github.com/nerfstudio-project/nerfstudio.git /tmp/nerfstudio
+# NOTE(review): NERFSTUDIO_VERSION is already declared in this stage above;
+# this second declaration looks redundant - confirm before removing.
+ARG NERFSTUDIO_VERSION
+# ${NERFSTUDIO_VERSION:+no_} expands to "no_" only when the variable is set and
+# non-empty, so the base is source_no_copy (git clone) when a version is given
+# and source_copy (local context) otherwise.
+FROM source_${NERFSTUDIO_VERSION:+no_}copy AS source
+
+# Build stage: based on the CUDA "devel" image, which is used to compile
+# COLMAP and the Python packages below.
+FROM nvidia/cuda:${NVIDIA_CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS builder
+ARG CUDA_ARCHITECTURES
+ARG NVIDIA_CUDA_VERSION
+ARG UBUNTU_VERSION
 
-# Define username, user uid and gid
-ARG USERNAME=user
-ARG USER_UID=1000
-ARG USER_GID=$USER_UID
-
-# metainformation
-LABEL org.opencontainers.image.version = "0.1.18"
-LABEL org.opencontainers.image.source = "https://github.com/nerfstudio-project/nerfstudio"
-LABEL org.opencontainers.image.licenses = "Apache License 2.0"
-LABEL org.opencontainers.image.base.name="docker.io/library/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${OS_VERSION}"
-
-# Variables used at build time.
-## CUDA architectures, required by Colmap and tiny-cuda-nn.
-## NOTE: All commonly used GPU architectures are included and supported here. To speedup the image build process remove all architectures but the one of your explicit GPU. Find details here: https://developer.nvidia.com/cuda-gpus (8.6 translates to 86 in the line below) or in the docs.
-ARG CUDA_ARCHITECTURES=90;89;86;80;75;70;61;52;37
-
-# Set environment variables.
-## Set non-interactive to prevent asking for user inputs blocking image creation.
 ENV DEBIAN_FRONTEND=noninteractive
-## Set timezone as it is required by some packages.
-ENV TZ=Europe/Berlin
-## CUDA Home, required to find CUDA in some packages.
-ENV CUDA_HOME="/usr/local/cuda"
-
-# Install required apt packages and clear cache afterwards.
+# NOTE(review): presumably selects the EGL integration of Qt's xcb platform
+# plugin for COLMAP. It is set in the builder stage only, so it is not part of
+# the runtime image's environment - confirm that this is intended.
+ENV QT_XCB_GL_INTEGRATION=xcb_egl
 RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    build-essential \
-    cmake \
-    curl \
-    ffmpeg \
-    git \
-    libatlas-base-dev \
-    libboost-filesystem-dev \
-    libboost-graph-dev \
-    libboost-program-options-dev \
-    libboost-system-dev \
-    libboost-test-dev \
-    libhdf5-dev \
-    libcgal-dev \
-    libeigen3-dev \
-    libflann-dev \
-    libfreeimage-dev \
-    libgflags-dev \
-    libglew-dev \
-    libgoogle-glog-dev \
-    libmetis-dev \
-    libprotobuf-dev \
-    libqt5opengl5-dev \
-    libsqlite3-dev \
-    libsuitesparse-dev \
-    nano \
-    protobuf-compiler \
-    python-is-python3 \
-    python3.10-dev \
-    python3-pip \
-    qtbase5-dev \
-    sudo \
-    vim-tiny \
-    wget && \
-    rm -rf /var/lib/apt/lists/*
-
-
-# Install GLOG (required by ceres).
-RUN git clone --branch v0.6.0 https://github.com/google/glog.git --single-branch && \
-    cd glog && \
-    mkdir build && \
-    cd build && \
-    cmake .. && \
-    make -j `nproc` && \
-    make install && \
-    cd ../.. && \
-    rm -rf glog
-# Add glog path to LD_LIBRARY_PATH.
-ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
-
-# Install Ceres-solver (required by colmap).
-RUN git clone --branch 2.1.0 https://ceres-solver.googlesource.com/ceres-solver.git --single-branch && \
-    cd ceres-solver && \
-    git checkout $(git describe --tags) && \
-    mkdir build && \
-    cd build && \
-    cmake .. -DBUILD_TESTING=OFF -DBUILD_EXAMPLES=OFF && \
-    make -j `nproc` && \
-    make install && \
-    cd ../.. && \
-    rm -rf ceres-solver
-
-# Install colmap.
-RUN git clone --branch 3.8 https://github.com/colmap/colmap.git --single-branch && \
+    apt-get install -y --no-install-recommends --no-install-suggests \
+        build-essential \
+        cmake \
+        git \
+        libboost-filesystem-dev \
+        libboost-graph-dev \
+        libboost-program-options-dev \
+        libboost-system-dev \
+        libceres-dev \
+        libcgal-dev \
+        libeigen3-dev \
+        libflann-dev \
+        libfreeimage-dev \
+        libglew-dev \
+        libgoogle-glog-dev \
+        libgtest-dev \
+        libmetis-dev \
+        libqt5opengl5-dev \
+        libsqlite3-dev \
+        ninja-build \
+        python3-pip \
+        python3.10-dev \
+        qtbase5-dev
+
+# Build and install COLMAP.
+RUN git clone https://github.com/colmap/colmap.git && \
     cd colmap && \
+    git checkout "3.9.1" && \
     mkdir build && \
     cd build && \
-    cmake .. -DCUDA_ENABLED=ON \
-             -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHITECTURES} && \
-    make -j `nproc` && \
-    make install && \
-    cd ../.. && \
-    rm -rf colmap
-
-# Create non root user, add it to custom group and setup environment.
-RUN groupadd --gid $USER_GID $USERNAME \
-    && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME -d /home/${USERNAME} --shell /usr/bin/bash 
-# OPTIONAL
-# If sudo privilages are not required comment below line
-# Create simple password for user and add it to sudo group
-# Update group so that it is not required to type password for commands: apt update/upgrade/install/remove
-RUN echo "${USERNAME}:password" | chpasswd \
-    && usermod -aG sudo ${USERNAME} \
-    && echo "%sudo ALL=NOPASSWD:/usr/bin/apt-get update, /usr/bin/apt-get upgrade, /usr/bin/apt-get install, /usr/bin/apt-get remove" >> /etc/sudoers
+    mkdir -p /build && \
+    # Configure with the Ninja generator for the GPU architectures listed in
+    # CUDA_ARCHITECTURES and install into /build/colmap instead of the system
+    # prefix; the runtime stage copies that directory.
+    cmake .. -GNinja "-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHITECTURES}" \
+        -DCMAKE_INSTALL_PREFIX=/build/colmap && \
+    # -j1 runs a single build job at a time.
+    # NOTE(review): presumably to limit memory use on CI runners - confirm.
+    ninja install -j1 && \
+    cd ~
+
+# Upgrade pip and install dependencies.
+# Everything below is installed in one layer, in this order:
+#   1. pip and setuptools (< 70.0.0)
+#   2. torch 2.1.2 / torchvision 0.16.2 built for CUDA 11.8, with numpy < 2.0.0
+#   3. Hierarchical-Localization (hloc) v1.4, cloned to /opt/hloc
+#   4. the tiny-cuda-nn torch bindings at a pinned commit, compiled for
+#      CUDA_ARCHITECTURES
+#   5. pycolmap, pyceres and omegaconf at pinned versions
+# NOTE(review): the "cu118" wheel suffix is hard-coded, so changing
+# NVIDIA_CUDA_VERSION does not change which torch wheels are installed.
+# Alternative torch pin, currently disabled:
+# pip install torch==2.2.2 torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu118 && \
+RUN pip install --no-cache-dir --upgrade pip 'setuptools<70.0.0' && \
+    pip install --no-cache-dir torch==2.1.2+cu118 torchvision==0.16.2+cu118 'numpy<2.0.0' --extra-index-url https://download.pytorch.org/whl/cu118 && \
+    git clone --branch master --recursive https://github.com/cvg/Hierarchical-Localization.git /opt/hloc && \
+    cd /opt/hloc && git checkout v1.4 && python3.10 -m pip install --no-cache-dir . && cd ~ && \
+    TCNN_CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES}" pip install --no-cache-dir "git+https://github.com/NVlabs/tiny-cuda-nn.git@b3473c81396fe927293bdfd5a6be32df8769927c#subdirectory=bindings/torch" && \
+    pip install --no-cache-dir pycolmap==0.6.1 pyceres==2.1 omegaconf==2.3.0
+
+# Install gsplat and nerfstudio.
+# NOTE: both are installed jointly, in a single RUN, in order to prevent the
+# docker cache from keeping a stale gsplat build (we do not explicitly specify
+# a commit hash; the gsplat version is read from nerfstudio's pyproject.toml).
+#
+# Steps performed by the RUN below:
+# - TORCH_CUDA_ARCH_LIST is derived from CUDA_ARCHITECTURES: the list is split
+#   on ';', entries greater than 70 are kept and rewritten from "86" to "8.6",
+#   and the result is joined with spaces. The default list therefore becomes
+#   "9.0 8.9 8.6 8.0 7.5".
+#   NOTE(review): the comparison is strict, so architecture 70 itself is
+#   dropped - confirm that this is intended.
+# - GSPLAT_VERSION is the value following "gsplat==" in pyproject.toml, and the
+#   matching "v<version>" git tag of gsplat is installed.
+# - nerfstudio is installed from /tmp/nerfstudio, which is deleted afterwards.
+#
+# We set MAX_JOBS to reduce resource usage for GH actions:
+# - https://github.com/nerfstudio-project/gsplat/blob/db444b904976d6e01e79b736dd89a1070b0ee1d0/setup.py#L13-L23
+COPY --from=source /tmp/nerfstudio/ /tmp/nerfstudio
+RUN export TORCH_CUDA_ARCH_LIST="$(echo "$CUDA_ARCHITECTURES" | tr ';' '\n' | awk '$0 > 70 {print substr($0,1,1)"."substr($0,2)}' | tr '\n' ' ' | sed 's/ $//')" && \
+    export MAX_JOBS=4 && \
+    GSPLAT_VERSION="$(sed -n 's/.*gsplat==\s*\([^," '"'"']*\).*/\1/p' /tmp/nerfstudio/pyproject.toml)" && \
+    pip install --no-cache-dir git+https://github.com/nerfstudio-project/gsplat.git@v${GSPLAT_VERSION} && \
+    pip install --no-cache-dir /tmp/nerfstudio 'numpy<2.0.0' && \
+    rm -rf /tmp/nerfstudio
+
+# Give group and others the same permissions as the owner on the Python
+# library tree and on /build.
+RUN chmod -R go=u /usr/local/lib/python3.10 /build
+
+#
+# Docker runtime stage.
+#
+# Starts from the CUDA "runtime" image; build results are copied in from the
+# builder stage further down.
+FROM nvidia/cuda:${NVIDIA_CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} AS runtime
+# Re-declare the global build arguments so they are visible in this stage.
+ARG CUDA_ARCHITECTURES
+ARG NVIDIA_CUDA_VERSION
+ARG UBUNTU_VERSION
 
-# Create workspace folder and change ownership to new user
-RUN mkdir /workspace && chown ${USER_UID}:${USER_GID} /workspace
-
-# Switch to new user and workdir.
-USER ${USER_UID}
-WORKDIR /home/${USERNAME}
-
-# Add local user binary folder to PATH variable.
-ENV PATH="${PATH}:/home/${USERNAME}/.local/bin"
-
-# Upgrade pip and install packages.
-RUN python3.10 -m pip install --no-cache-dir --upgrade pip setuptools==69.5.1 pathtools promise pybind11 omegaconf
-
-# Install pytorch and submodules
-# echo "${CUDA_VERSION}" | sed 's/.$//' | tr -d '.' -- CUDA_VERSION -> delete last digit -> delete all '.'
-RUN CUDA_VER=$(echo "${CUDA_VERSION}" | sed 's/.$//' | tr -d '.') && python3.10 -m pip install --no-cache-dir \
-    torch==2.1.2+cu${CUDA_VER} \
-    torchvision==0.16.2+cu${CUDA_VER} \
-        --extra-index-url https://download.pytorch.org/whl/cu${CUDA_VER}
-
-# Install tiny-cuda-nn (we need to set the target architectures as environment variable first).
-ENV TCNN_CUDA_ARCHITECTURES=${CUDA_ARCHITECTURES}
-RUN python3.10 -m pip install --no-cache-dir git+https://github.com/NVlabs/tiny-cuda-nn.git#subdirectory=bindings/torch
-
-# Install pycolmap, required by hloc.
-RUN git clone --branch v0.4.0 --recursive https://github.com/colmap/pycolmap.git && \
-    cd pycolmap && \
-    python3.10 -m pip install --no-cache-dir . && \
-    cd ..
-
-# Install hloc 1.4 as alternative feature detector and matcher option for nerfstudio.
-RUN git clone --branch master --recursive https://github.com/cvg/Hierarchical-Localization.git && \
-    cd Hierarchical-Localization && \
-    git checkout v1.4 && \
-    python3.10 -m pip install --no-cache-dir -e . && \
-    cd ..
-
-# Install pyceres from source
-RUN git clone --branch v1.0 --recursive https://github.com/cvg/pyceres.git && \
-    cd pyceres && \
-    python3.10 -m pip install --no-cache-dir -e . && \
-    cd ..
-
-# Install pixel perfect sfm.
-RUN git clone --recursive https://github.com/cvg/pixel-perfect-sfm.git && \
-    cd pixel-perfect-sfm && \
-    git reset --hard 40f7c1339328b2a0c7cf71f76623fb848e0c0357 && \
-    git clean -df && \
-    python3.10 -m pip install --no-cache-dir -e . && \
-    cd ..
-
-# Copy nerfstudio folder and give ownership to user.
-COPY --chown=${USER_UID}:${USER_GID} . /home/${USERNAME}/nerfstudio
-
-# Install nerfstudio dependencies.
-RUN cd nerfstudio && \
-    python3.10 -m pip install --no-cache-dir -e . && \
-    cd ..
+# OCI image metadata. Each label is written as key="value" with no spaces
+# around "=": with spaces Docker falls back to the legacy "LABEL key value"
+# form and the stored value starts with a stray "=".
+# base.name names the image this stage is actually built FROM (the CUDA
+# runtime image), and licenses uses the SPDX identifier.
+LABEL org.opencontainers.image.source="https://github.com/nerfstudio-project/nerfstudio"
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:${NVIDIA_CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}"
+LABEL org.opencontainers.image.documentation="https://docs.nerf.studio/"
 
-# Switch to workspace folder and install nerfstudio cli auto completion
-WORKDIR /workspace
-RUN ns-install-cli --mode install
+# Minimal dependencies to run COLMAP binary compiled in the builder stage.
+# Note: this reduces the size of the final image considerably, since all the
+# build dependencies are not needed.
+# python3.10-dev and build-essential are still installed here.
+# NOTE(review): presumably so that torch.compile can build code at run time
+# (the commit message mentions a dynamo build fix) - confirm.
+# DEBIAN_FRONTEND is set inline because the builder stage's ENV does not carry
+# over to this stage, and the apt package index is removed in the same layer
+# so that it does not end up in the final image.
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends --no-install-suggests \
+        libboost-filesystem1.74.0 \
+        libboost-program-options1.74.0 \
+        libc6 \
+        libceres2 \
+        libfreeimage3 \
+        libgcc-s1 \
+        libgl1 \
+        libglew2.2 \
+        libgoogle-glog0v5 \
+        libqt5core5a \
+        libqt5gui5 \
+        libqt5widgets5 \
+        python3.10 \
+        python3.10-dev \
+        build-essential \
+        python-is-python3 \
+        ffmpeg && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy packages from builder stage:
+#   - the COLMAP install tree (/build/colmap) is merged into /usr/local
+#   - the Python packages that pip installed in the builder stage
+#   - the executables in /usr/local/bin whose names start with "ns"
+COPY --from=builder /build/colmap/ /usr/local/
+COPY --from=builder /usr/local/lib/python3.10/dist-packages/ /usr/local/lib/python3.10/dist-packages/
+COPY --from=builder /usr/local/bin/ns* /usr/local/bin/
+
+# Install nerfstudio cli auto completion
+# The command is run through bash explicitly rather than the default shell.
+RUN /bin/bash -c 'ns-install-cli --mode install'
 
 # Bash as default entrypoint.
 CMD /bin/bash -l
-# Force changing password on first container run
-# Change line above: CMD /bin/bash -l -> CMD /bin/bash -l -c passwd && /usr/bin/bash -l
diff --git a/docs/quickstart/installation.md b/docs/quickstart/installation.md