coreweave · wbrown · May 14, 2024 · Apr 30, 2024 · Apr 30, 2024 · Apr 30, 2024
diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml
@@ -5,6 +5,6 @@ exclude:
   - cuda: 11.8.0
     os: ubuntu22.04
 include:
-  - torch: 2.2.2
-    vision: 0.17.2
-    audio: 2.2.2
+  - torch: 2.3.0
+    vision: 0.18.0
+    audio: 2.3.0
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
@@ -15,8 +15,8 @@ image:
   # Ubuntu 20.04
   - cuda: 12.2.2
     os: ubuntu20.04
-    nccl: 2.20.3-1
-    nccl-tests-hash: 868dc3d
+    nccl: 2.21.5-1
+    nccl-tests-hash: 027b52a
   - cuda: 12.1.1
     os: ubuntu20.04
     nccl: 2.18.3-1
@@ -30,6 +30,6 @@ image:
     nccl: 2.16.5-1
     nccl-tests-hash: 868dc3d
 include:
-  - torch: 2.2.2
-    vision: 0.17.2
-    audio: 2.2.2
+  - torch: 2.3.0
+    vision: 0.18.0
+    audio: 2.3.0
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -15,6 +15,10 @@ on:
       tag-suffix:
         required: false
         type: string
+      cache-key:
+        required: false
+        description: "Optional sub-key to append to the image name for build layer caching"
+        type: string
     outputs:
       outcome:
         description: "The outcome of the build"
@@ -60,6 +64,16 @@ jobs:
         if: inputs.tag-suffix != ''
         run: |
           echo "TAG_SUFFIX=-${{ inputs.tag-suffix }}" >> $GITHUB_ENV
+      # The inputs reach the script through step-level environment variables
+      # instead of being expanded into the script text, so their contents are
+      # never parsed as shell syntax.
+      - name: Set cache key
+        if: inputs.cache-key != ''
+        env:
+          CACHE_IMAGE_NAME: ${{ inputs.image-name }}
+          CACHE_SUB_KEY: ${{ inputs.cache-key }}
+        run: |
+          echo "CACHE_KEY=${CACHE_IMAGE_NAME}-${CACHE_SUB_KEY}" >> "$GITHUB_ENV"
       - name: Extract metadata (tags, labels) for Docker
         id: meta
         uses: docker/metadata-action@v4.1.1
@@ -77,8 +91,10 @@ jobs:
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ inputs.image-name }}
-          cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ inputs.image-name }},mode=max
+          # CACHE_KEY is only set when a cache-key input was given; otherwise
+          # the cache tag is the bare image name.
+          cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
+          cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
       - uses: 8BitJonny/gh-get-current-pr@2.1.3
         id: PR
         with:

diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml
@@ -41,4 +41,5 @@ jobs:
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
+      cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml
@@ -16,6 +16,9 @@ on:
         required: false
         type: boolean
         default: true
+      cache-key:
+        required: false
+        type: string
 
   workflow_dispatch:
     inputs:
@@ -144,14 +147,15 @@ jobs:
       # if you need to build for multiple versions of flash-attn
       # fix tag-suffix below to reflect your versions in the tag
       matrix:
-        flash-attn: [ 2.4.2 ]
+        flash-attn: [ 2.5.8 ]
     uses: ./.github/workflows/build.yml
     secrets: inherit
     with:
       image-name: ${{ inputs.image-name || 'torch-extras' }}
       folder: torch-extras
       #tag-suffix: ${{ inputs.tag }}-flash_attn${{ matrix.flash-attn }}
       tag-suffix: ${{ inputs.tag }}
+      cache-key: ${{ inputs.cache-key }}
       build-args: |
         BASE_IMAGE=${{ inputs.base-image }}
         FLASH_ATTN_VERSION=${{ matrix.flash-attn }}
@@ -164,7 +168,7 @@ jobs:
       matrix:
         # if you need to build for multiple versions of flash-attn
         # fix tag-suffix below to reflect your versions in the tag
-        flash-attn: [ 2.4.2 ]
+        flash-attn: [ 2.5.8 ]
         bases: ${{ fromJSON(needs.get-required-bases.outputs.bases-list) }}
     uses: ./.github/workflows/build.yml
     secrets: inherit

diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml
@@ -49,4 +49,5 @@ jobs:
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
+      cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }}
       build-extras: true
diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
@@ -116,6 +116,7 @@ jobs:
       torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
       torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
       triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
+      cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
   build-nccl:
     name: Build Nightly torch:nccl
@@ -136,4 +137,5 @@ jobs:
       torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
       torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
       triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
+      cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }}
       build-extras: true
diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml
@@ -33,6 +33,9 @@ on:
         required: false
         type: boolean
         default: false
+      cache-key:
+        required: false
+        type: string
 
   workflow_dispatch:
     inputs:
@@ -88,8 +91,9 @@ jobs:
       image-name: ${{ inputs.image-name || 'torch' }}
       folder: torch
       tag-suffix: ${{ inputs.tag }}
+      cache-key: ${{ inputs.cache-key }}
       build-args: |
-        BUILD_CCACHE_SIZE=250Mi
+        BUILD_CCACHE_SIZE=5Gi
         BUILDER_BASE_IMAGE=${{ inputs.builder-base-image }}
         FINAL_BASE_IMAGE=${{ inputs.base-image }}
         BUILD_TORCH_VERSION=${{ inputs.torch-version }}
@@ -107,3 +111,4 @@ jobs:
       tag: ${{ inputs.tag }}
       base-image: ${{ needs.build.outputs.tags }}
       image-name: ${{ inputs.image-name && format('{0}-extras', inputs.image-name) || '' }}
+      cache-key: ${{ inputs.cache-key }}
diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
@@ -1,9 +1,9 @@
 # syntax=docker/dockerfile:1.2
 
 ARG BASE_IMAGE
-ARG DEEPSPEED_VERSION="0.12.6"
-ARG FLASH_ATTN_VERSION="2.4.2"
-ARG APEX_COMMIT="2386a912164b0c5cfcd8be7a2b890fbac5607c82"
+ARG DEEPSPEED_VERSION="0.14.2"
+ARG FLASH_ATTN_VERSION="2.5.8"
+ARG APEX_COMMIT="a7de60e57f0534266841e1733262601ad76aaa74"
 ARG XFORMERS_VERSION="0.0.23.post1"
 
 FROM alpine/git:2.36.3 as flash-attn-downloader
@@ -89,7 +89,7 @@ FROM builder-base as deepspeed-builder
 # DeepSpeed build flags
 # See: https://www.deepspeed.ai/tutorials/advanced-install
 ARG DS_BUILD_OPS="1"
-ARG DS_BUILD_CCL_COMM=""
+ARG DS_BUILD_CCL_COMM="0"
 ARG DS_BUILD_CPU_ADAM=""
 ARG DS_BUILD_CPU_LION=""
 # Requires CUTLASS

diff --git a/torch/Dockerfile b/torch/Dockerfile
@@ -2,9 +2,9 @@
 ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.0.1-devel-ubuntu22.04"
 ARG FINAL_BASE_IMAGE="nvidia/cuda:12.0.1-base-ubuntu22.04"
 
-ARG BUILD_TORCH_VERSION="2.2.0"
-ARG BUILD_TORCH_VISION_VERSION="0.17.0"
-ARG BUILD_TORCH_AUDIO_VERSION="2.2.0"
+ARG BUILD_TORCH_VERSION="2.3.0"
+ARG BUILD_TORCH_VISION_VERSION="0.18.0"
+ARG BUILD_TORCH_AUDIO_VERSION="2.3.0"
 ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX"
 # 8.7 is supported in the PyTorch main branch, but not 2.0.0
@@ -70,7 +70,7 @@ ARG BUILD_CCACHE_SIZE="1Gi"
 # ninja-build, ccache, and lld are optional but improve the build
 RUN apt-get -qq update && apt-get -qq install -y \
       libncurses5 python3 python3-pip git apt-utils ssh ca-certificates \
-      libpng-dev libjpeg-dev pkg-config python3-distutils python3-numpy \
+      libpng-dev libjpeg-dev pkg-config python3-distutils \
       build-essential ninja-build && \
     apt-get clean && \
     /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \
@@ -105,21 +105,25 @@ RUN apt-get -qq update && apt-get -qq install -y \
 RUN CODENAME="$(lsb_release -cs)" && \
     wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
     apt-add-repository -n "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \
-    apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \
-    | sed -e '/connection timed out/{p; Q1}' && \
-    apt-get -qq install --no-install-recommends -y gcc-11 g++-11 lld-17 && \
-    apt-get clean && \
+    SETUP_TOOLCHAIN() { \
+        apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \
+        | sed -e '/connection timed out/{p; Q1}' && \
+        apt-get -qq install --no-install-recommends -y gcc-11 g++-11 lld-17 && \
+        apt-get clean; \
+    } && \
+    { SETUP_TOOLCHAIN || { sleep "$(shuf -i10-20 -n1)" && SETUP_TOOLCHAIN; }; } && \
     update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \
     update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
     update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1
 
 RUN mkdir /build /build/dist
 WORKDIR /build
 COPY --chmod=755 effective_cpu_count.sh .
+COPY --chmod=755 scale.sh .
 
 COPY <<-"EOT" /build/version-string.sh
     #!/bin/sh
-    set -x;
+    set -e;
     VERSION="$1";
 
     IS_HASH() {
@@ -136,8 +140,19 @@ COPY <<-"EOT" /build/version-string.sh
 EOT
 RUN chmod 755 /build/version-string.sh
 
+COPY <<-"EOT" /build/storage-info.sh
+    #!/bin/sh
+    set -e;
+    TARGET="$(realpath "$1")";
+
+    STORAGE_INFO="$(df -h '--output=fstype,used,avail,pcent,target' "$TARGET")" || exit 0;
+    printf 'Storage info for %s:\n%s\n' "$TARGET" "$STORAGE_INFO";
+EOT
+RUN chmod 755 /build/storage-info.sh
+
 ## Build torch
 RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/ \
+    pip3 install --no-cache-dir --upgrade numpy && \
     cd pytorch && pip3 install --no-cache-dir -r requirements.txt
 
 # Build tool & library paths, shared for all libraries to be built
@@ -151,10 +166,10 @@ ARG BUILD_TRITON_VERSION
 RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \
     --mount=type=cache,target=/ccache \
     if [ -n "$BUILD_TRITON_VERSION" ]; then \
-      export MAX_JOBS="$(./effective_cpu_count.sh)" && \
+      MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && export MAX_JOBS && \
       cd triton/python && \
-      python -m pip wheel -w wheels/ --no-build-isolation --no-deps -vv . && \
-      pip install wheels/*.whl; \
+      python3 -m pip wheel -w ../../dist/ --no-build-isolation --no-deps -vv . && \
+      pip3 install ../../dist/*.whl; \
     fi
 
 ARG BUILD_TORCH_VERSION
@@ -177,8 +192,11 @@ ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
 # remain the same.
 RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \
     --mount=type=cache,target=/ccache \
-    export MAX_JOBS="$(./effective_cpu_count.sh)" && \
+    MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && export MAX_JOBS && \
+    echo "MAX_JOBS: ${MAX_JOBS}" && \
+    ./storage-info.sh . && \
     cd pytorch && \
+    ../storage-info.sh . && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
     ln -s /usr/bin/c++ build/c++ && \
@@ -214,7 +232,7 @@ RUN pip3 install --no-cache-dir --upgrade \
 
 RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=vision/,rw \
     --mount=type=cache,target=/ccache \
-    export MAX_JOBS="$(./effective_cpu_count.sh)" && \
+    MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && export MAX_JOBS && \
     cd vision && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
@@ -250,7 +268,7 @@ RUN pip3 install --no-cache-dir --upgrade \
 
 RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/,rw \
     --mount=type=cache,target=/ccache \
-    export MAX_JOBS="$(./effective_cpu_count.sh)" && \
+    MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && export MAX_JOBS && \
     cd audio && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
@@ -285,7 +303,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 
 # Install core packages
 RUN apt-get -qq update && apt-get -qq install -y \
-      libncurses5 python3 python3-pip python3-distutils python3-numpy \
+      libncurses5 python3 python3-pip python3-distutils \
       libpng16-16 libjpeg-turbo8 libsodium23 \
       curl git apt-utils ssh ca-certificates tmux nano vim-tiny sudo bash \
       rsync htop wget unzip tini && \
@@ -297,10 +315,13 @@ RUN apt-get -qq update && apt-get -qq install -y \
 
 RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \
         software-properties-common && \
-    apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \
-    | sed -e '/connection timed out/{p; Q1}' && \
-    apt-get -qq install -y --no-install-recommends libstdc++6 && \
-    apt-get clean
+    SETUP_LIBSTDCXX() { \
+        apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \
+        | sed -e '/connection timed out/{p; Q1}' && \
+        apt-get -qq install -y --no-install-recommends libstdc++6 && \
+        apt-get clean; \
+    } && \
+    { SETUP_LIBSTDCXX || { sleep "$(shuf -i10-20 -n1)" && SETUP_LIBSTDCXX; }; }
 
 ARG BUILD_TORCH_VERSION
 ARG BUILD_TORCH_VISION_VERSION
@@ -342,4 +363,5 @@ WORKDIR /usr/src/app
 
 # Install custom PyTorch wheels.
 RUN --mount=type=bind,from=builder,source=/build/dist,target=. \
+    pip3 install --no-cache-dir -U numpy && \
     pip3 install --no-cache-dir -U ./*.whl
diff --git a/torch/scale.sh b/torch/scale.sh
@@ -0,0 +1,50 @@
+#!/bin/sh
+
+# Usage: scale.sh VALUE [DIVISOR [MAXIMUM]]
+#
+# Prints VALUE divided by DIVISOR (rounded up), capped at MAXIMUM.
+# DIVISOR and MAXIMUM are optional; an omitted or empty argument skips
+# the corresponding step. Every argument given must be a non-negative
+# decimal integer without leading zeros, and DIVISOR must not be zero.
+#
+# On invalid input, prints a message to stderr and exits non-zero
+# without writing anything to stdout.
+
+set -e;
+
+VAL="$1";
+DIVISOR="$2";
+MAXIMUM="$3";
+
+DIE() {
+    echo "scale.sh: $1" >&2;
+    exit 1;
+}
+
+# Succeeds only for a plain decimal integer such as 0, 7, or 128.
+# Leading zeros are rejected because arithmetic expansion would read
+# them as octal.
+IS_UINT() {
+    case "$1" in
+        '' | *[!0-9]* | 0?*) return 1;;
+    esac;
+    return 0;
+}
+
+IS_UINT "$VAL" || DIE "VALUE must be a non-negative integer, got '$VAL'";
+
+if [ -n "$DIVISOR" ];
+then
+    IS_UINT "$DIVISOR" || DIE "DIVISOR must be a positive integer, got '$DIVISOR'";
+    [ "$DIVISOR" -ne 0 ] || DIE "DIVISOR must be a positive integer, got '$DIVISOR'";
+    # Adding DIVISOR - 1 before dividing rounds the quotient up.
+    VAL="$(( (VAL + DIVISOR - 1) / DIVISOR ))";
+fi;
+
+if [ -n "$MAXIMUM" ];
+then
+    IS_UINT "$MAXIMUM" || DIE "MAXIMUM must be a non-negative integer, got '$MAXIMUM'";
+    if [ "$VAL" -gt "$MAXIMUM" ]; then VAL="$MAXIMUM"; fi;
+fi;
+
+echo "$VAL";