From 9a5990d38e2e1286c33ef7688e68a16af93c4905 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 4 Nov 2024 14:48:38 -0600
Subject: [PATCH 01/94] feat(torch): Update PyTorch to v2.5.1 & CUDA 12.6.1
 to 12.6.2

---
 .github/configurations/torch-base.yml | 4 ++--
 .github/configurations/torch-nccl.yml | 2 +-
 torch/Dockerfile                      | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml
index 761f2b6b..4c94e304 100644
--- a/.github/configurations/torch-base.yml
+++ b/.github/configurations/torch-base.yml
@@ -1,6 +1,6 @@
-cuda: [ 12.6.1, 12.4.1, 12.2.2 ]
+cuda: [ 12.6.2, 12.4.1, 12.2.2 ]
 os: [ ubuntu22.04, ubuntu20.04 ]
 include:
-  - torch: 2.5.0
+  - torch: 2.5.1
     vision: 0.20.0
     audio: 2.5.0
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index 6bd5f029..84c743ed 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -32,6 +32,6 @@ image:
     nccl: 2.21.5-1
     nccl-tests-hash: 2ff05b2
 include:
-  - torch: 2.5.0
+  - torch: 2.5.1
     vision: 0.20.0
     audio: 2.5.0
diff --git a/torch/Dockerfile b/torch/Dockerfile
index 6705427d..24f7f1f2 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -2,7 +2,7 @@
 ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.4.1-devel-ubuntu22.04"
 ARG FINAL_BASE_IMAGE="nvidia/cuda:12.4.1-base-ubuntu22.04"
 
-ARG BUILD_TORCH_VERSION="2.5.0"
+ARG BUILD_TORCH_VERSION="2.5.1"
 ARG BUILD_TORCH_VISION_VERSION="0.20.0"
 ARG BUILD_TORCH_AUDIO_VERSION="2.5.0"
 ARG BUILD_TRANSFORMERENGINE_VERSION="1.11"

From eec6daabd4e7899216875b9cb53ee341697bafc5 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 4 Nov 2024 16:11:33 -0600
Subject: [PATCH 02/94] feat(torch): Update `torch:nccl` base images

---
 .github/configurations/torch-nccl.yml | 25 +++++--------------------
 .github/workflows/torch-nccl.yml      |  4 ++--
 .github/workflows/torch-nightly.yml   |  4 ++--
 3 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index 84c743ed..a34899b7 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -1,37 +1,22 @@
 image:
   # Ubuntu 22.04
-  - cuda: 12.6.1
-    cudnn: cudnn
+  - cuda: 12.6.2
     os: ubuntu22.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
   - cuda: 12.4.1
-    cudnn: cudnn
     os: ubuntu22.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
   - cuda: 12.2.2
-    cudnn: cudnn8
     os: ubuntu22.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
   # Ubuntu 20.04
-  - cuda: 12.6.1
-    cudnn: cudnn
+  - cuda: 12.6.2
     os: ubuntu20.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
   - cuda: 12.4.1
-    cudnn: cudnn
     os: ubuntu20.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
   - cuda: 12.2.2
-    cudnn: cudnn8
     os: ubuntu20.04
-    nccl: 2.21.5-1
-    nccl-tests-hash: 2ff05b2
 include:
   - torch: 2.5.1
     vision: 0.20.0
     audio: 2.5.0
+    image:
+      nccl: 2.23.4-1
+      nccl-tests-hash: c58f522
diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml
index aee13052..0c78bc55 100644
--- a/.github/workflows/torch-nccl.yml
+++ b/.github/workflows/torch-nccl.yml
@@ -44,8 +44,8 @@ jobs:
     with:
       image-name: ${{ inputs.image-name }}
       tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.image.cuda, matrix.image.os, matrix.image.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
-      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
-      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
index 063e40af..d12e09de 100644
--- a/.github/workflows/torch-nightly.yml
+++ b/.github/workflows/torch-nightly.yml
@@ -131,8 +131,8 @@ jobs:
     with:
       image-name: nightly-torch
       tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.os, matrix.image.nccl, needs.get-nightly-info.outputs.version-string ) }}
-      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
-      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
       torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
       torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
       torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}

From e411a3d961f4f4487cb6793620d8db8c459d6f8e Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 5 Nov 2024 13:44:40 -0600
Subject: [PATCH 03/94] ci(torch): Change `torch:nccl` matrix build layout

---
 .github/configurations/torch-nccl.yml | 22 ++++------------------
 .github/workflows/torch-nccl.yml      |  8 ++++----
 .github/workflows/torch-nightly.yml   |  8 ++++----
 3 files changed, 12 insertions(+), 26 deletions(-)

diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index a34899b7..74da04cc 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -1,22 +1,8 @@
-image:
-  # Ubuntu 22.04
-  - cuda: 12.6.2
-    os: ubuntu22.04
-  - cuda: 12.4.1
-    os: ubuntu22.04
-  - cuda: 12.2.2
-    os: ubuntu22.04
-  # Ubuntu 20.04
-  - cuda: 12.6.2
-    os: ubuntu20.04
-  - cuda: 12.4.1
-    os: ubuntu20.04
-  - cuda: 12.2.2
-    os: ubuntu20.04
+cuda: [ 12.6.2, 12.4.1, 12.2.2 ]
+os: [ ubuntu22.04, ubuntu20.04 ]
 include:
   - torch: 2.5.1
     vision: 0.20.0
     audio: 2.5.0
-    image:
-      nccl: 2.23.4-1
-      nccl-tests-hash: c58f522
+    nccl: 2.23.4-1
+    nccl-tests-hash: c58f522
diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml
index 0c78bc55..9815639d 100644
--- a/.github/workflows/torch-nccl.yml
+++ b/.github/workflows/torch-nccl.yml
@@ -43,11 +43,11 @@ jobs:
     secrets: inherit
     with:
       image-name: ${{ inputs.image-name }}
-      tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.image.cuda, matrix.image.os, matrix.image.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
-      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
-      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
+      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
+      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
-      cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }}
+      cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
index d12e09de..bdf4a1b8 100644
--- a/.github/workflows/torch-nightly.yml
+++ b/.github/workflows/torch-nightly.yml
@@ -130,12 +130,12 @@ jobs:
     secrets: inherit
     with:
       image-name: nightly-torch
-      tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.os, matrix.image.nccl, needs.get-nightly-info.outputs.version-string ) }}
-      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
-      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.cuda, matrix.os, matrix.nccl, needs.get-nightly-info.outputs.version-string ) }}
+      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
+      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
       torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
       torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
       torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
       triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
-      cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }}
+      cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true

From 86120387a3dcaff6439ac0b808137a286a3efd8b Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 6 Nov 2024 16:11:59 -0600
Subject: [PATCH 04/94] ci: Update action versions

---
 .github/workflows/build.yml | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index cbb09fdc..7301747e 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -39,20 +39,15 @@ jobs:
       tags: ${{ steps.meta.outputs.tags }}
       version: ${{ steps.meta.outputs.version }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2.2.1
+        uses: docker/setup-buildx-action@v3.7.1
       - name: Login to GitHub container registry
         uses: docker/login-action@v2.2.0
         with:
           registry: ghcr.io
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Login to DockerHub container registry
-        uses: docker/login-action@v2.2.0
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
       - name: Get base registry
         run: |
           echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV
@@ -70,14 +65,14 @@ jobs:
           echo "CACHE_KEY=${{ inputs.image-name }}-${{ inputs.cache-key }}" >> $GITHUB_ENV
       - name: Extract metadata (tags, labels) for Docker
         id: meta
-        uses: docker/metadata-action@v4.1.1
+        uses: docker/metadata-action@v5.5.1
         with:
           images: ${{ env.REGISTRY }}/${{ inputs.image-name }}
           tags: |
             type=sha,prefix=${{ env.TAG_PREFIX }},suffix=${{ env.TAG_SUFFIX }},format=short
       - name: Build and push Docker image
         id: docker-build
-        uses: docker/build-push-action@v3.2.0
+        uses: docker/build-push-action@v6.9.0
         with:
           context: ${{ inputs.folder }}
           build-args: |-
@@ -87,6 +82,7 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
           cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
+          platforms: linux/amd64
       - uses: 8BitJonny/gh-get-current-pr@2.1.3
         id: PR
         with:

From f58c927d7907931cf00d7a618d4a54e9f36cd4e8 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 6 Nov 2024 16:31:59 -0600
Subject: [PATCH 05/94] ci: Use remote BuildKit worker & new runners

---
 .github/workflows/build.yml              | 38 +++++++++++++++++++-----
 .github/workflows/read-configuration.yml |  8 +++--
 .github/workflows/torch-extras.yml       |  8 +++--
 .github/workflows/torch-nightly.yml      |  6 +++-
 4 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7301747e..065233e3 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -33,21 +33,34 @@ on:
 jobs:
   build:
     name: Build Images
-    runs-on: [ self-hosted, Linux ]
+    runs-on: [ cw ]
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
+    defaults:
+      run:
+        shell: bash
     outputs:
       outcome: ${{ steps.docker-build.outcome }}
       tags: ${{ steps.meta.outputs.tags }}
       version: ${{ steps.meta.outputs.version }}
     steps:
       - uses: actions/checkout@v4
+      - name: Fetch BuildKit Client Certs
+        uses: dopplerhq/secrets-fetch-action@v1.2.0
+        id: client-certs
+        with:
+          doppler-token: ${{ secrets.ORG_BUILDKIT_CLIENT_TOKEN }}
+          doppler-project: ${{ secrets.BUILDKIT_CONSUMER_DOPPLER_PROJECT }}
+          doppler-config: prod
+          inject-env-vars: false
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3.7.1
-      - name: Login to GitHub container registry
-        uses: docker/login-action@v2.2.0
         with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
+          driver: remote
+          endpoint: ${{ secrets.BUILDKIT_CONSUMER_ENDPOINT }}
+        env:
+          BUILDER_NODE_0_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
+          BUILDER_NODE_0_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
+          BUILDER_NODE_0_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
       - name: Get base registry
         run: |
           echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV
@@ -70,6 +83,13 @@ jobs:
           images: ${{ env.REGISTRY }}/${{ inputs.image-name }}
           tags: |
             type=sha,prefix=${{ env.TAG_PREFIX }},suffix=${{ env.TAG_SUFFIX }},format=short
+      - name: Initialize registry credentials file
+        env:
+          USER: ${{ github.actor }}
+          PASS: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          jq -n '.auths."ghcr.io" = { username: env.USER, password: env.PASS }' \
+          | install -m400 /dev/stdin ~/.docker/config.json
       - name: Build and push Docker image
         id: docker-build
         uses: docker/build-push-action@v6.9.0
@@ -77,12 +97,16 @@ jobs:
           context: ${{ inputs.folder }}
           build-args: |-
             ${{ inputs.build-args }}
-          push: true
+          push: false
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
           cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
           platforms: linux/amd64
+      - name: Clear registry credentials
+        if: always()
+        run: |
+          rm -f ~/.docker/config.json && [ ! -e ~/.docker/config.json ]
       - uses: 8BitJonny/gh-get-current-pr@2.1.3
         id: PR
         with:
diff --git a/.github/workflows/read-configuration.yml b/.github/workflows/read-configuration.yml
index 12a21b31..25f5de3e 100644
--- a/.github/workflows/read-configuration.yml
+++ b/.github/workflows/read-configuration.yml
@@ -17,12 +17,16 @@ on:
 jobs:
   read-file:
     name: Read Configuration File
-    runs-on: ["self-hosted", "Linux"]
+    runs-on: [ cw ]
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
+    defaults:
+      run:
+        shell: bash
     permissions: {}
     outputs:
       config: ${{ steps.read.outputs.contents }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Read configuration
         id: read
         env:
diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml
index ca7134ed..e37a6c18 100644
--- a/.github/workflows/torch-extras.yml
+++ b/.github/workflows/torch-extras.yml
@@ -51,13 +51,17 @@ jobs:
   get-required-bases:
     name: Get Latest Required Base Images
     if: inputs.skip-bases-check != true
-    runs-on: ["self-hosted", "Linux"]
+    runs-on: [ cw ]
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
+    defaults:
+      run:
+        shell: bash
     permissions:
       packages: read
     outputs:
       bases-list: ${{ steps.choose-bases.outputs.list }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
       - name: Check if torch-extras needs to be rebuilt from previous bases
diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
index bdf4a1b8..4c3a4b30 100644
--- a/.github/workflows/torch-nightly.yml
+++ b/.github/workflows/torch-nightly.yml
@@ -19,7 +19,11 @@ jobs:
   get-nightly-info:
     name:
       Get Nightly Info
-    runs-on: [ self-hosted, Linux ]
+    runs-on: [ cw ]
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
+    defaults:
+      run:
+        shell: bash
     outputs:
       pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }}
       triton-commit: ${{ steps.get-hash.outputs.triton-commit }}

From a3f1d783c80a6cec0964744e9ae162ce7a1fea2b Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 6 Nov 2024 17:00:19 -0600
Subject: [PATCH 06/94] ci(torch-nightly): Only filter specific fields from
 configs' `include`

---
 .github/workflows/torch-nightly.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
index 4c3a4b30..5841531c 100644
--- a/.github/workflows/torch-nightly.yml
+++ b/.github/workflows/torch-nightly.yml
@@ -93,13 +93,13 @@ jobs:
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-base.yml
-      filter: del(.include)
+      filter: '.include = ( .include | del(.torch, .vision, .audio) )'
   get-nccl-config:
     name: Get torch:nccl Config
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-nccl.yml
-      filter: del(.include)
+      filter: '.include = ( .include | del(.torch, .vision, .audio) )'
 
   build-base:
     name: Build Nightly torch:base

From e6bb688b043f0079cb8a3db750c07d46cfd7216b Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 6 Nov 2024 17:07:55 -0600
Subject: [PATCH 07/94] ci(torch-nightly): Change `del()` syntax in `yq` filter

---
 .github/workflows/torch-nightly.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
index 5841531c..3ec7de14 100644
--- a/.github/workflows/torch-nightly.yml
+++ b/.github/workflows/torch-nightly.yml
@@ -93,13 +93,13 @@ jobs:
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-base.yml
-      filter: '.include = ( .include | del(.torch, .vision, .audio) )'
+      filter: 'del( .include | ( .torch, .vision, .audio ) )'
   get-nccl-config:
     name: Get torch:nccl Config
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-nccl.yml
-      filter: '.include = ( .include | del(.torch, .vision, .audio) )'
+      filter: 'del( .include | ( .torch, .vision, .audio ) )'
 
   build-base:
     name: Build Nightly torch:base

From 4fe66e299e3e6107ef0a6ebeee93812232626a8d Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 6 Nov 2024 17:11:23 -0600
Subject: [PATCH 08/94] ci(torch-nightly): Treat `include` as an array in `yq`
 filter

---
 .github/workflows/torch-nightly.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
index 3ec7de14..25f8be2a 100644
--- a/.github/workflows/torch-nightly.yml
+++ b/.github/workflows/torch-nightly.yml
@@ -93,13 +93,13 @@ jobs:
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-base.yml
-      filter: 'del( .include | ( .torch, .vision, .audio ) )'
+      filter: 'del( .include[] | ( .torch, .vision, .audio ) )'
   get-nccl-config:
     name: Get torch:nccl Config
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-nccl.yml
-      filter: 'del( .include | ( .torch, .vision, .audio ) )'
+      filter: 'del( .include[] | ( .torch, .vision, .audio ) )'
 
   build-base:
     name: Build Nightly torch:base

From 1c31940b9ef6027c315c8d4114bfb49a55908803 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 6 Nov 2024 17:44:13 -0600
Subject: [PATCH 09/94] ci(torch-nightly): Exclude `ubuntu20.04` from
 `torch-nightly` builds

---
 .github/workflows/torch-nightly.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
index 25f8be2a..8d700747 100644
--- a/.github/workflows/torch-nightly.yml
+++ b/.github/workflows/torch-nightly.yml
@@ -93,13 +93,13 @@ jobs:
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-base.yml
-      filter: 'del( .include[] | ( .torch, .vision, .audio ) )'
+      filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"os": "ubuntu20.04"}]'
   get-nccl-config:
     name: Get torch:nccl Config
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-nccl.yml
-      filter: 'del( .include[] | ( .torch, .vision, .audio ) )'
+      filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"os": "ubuntu20.04"}]'
 
   build-base:
     name: Build Nightly torch:base

From 092a837e17189302b03c46cd6e9d592d012764af Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 6 Nov 2024 17:49:37 -0600
Subject: [PATCH 10/94] ci(torch-nightly): Filter out entire `include` key for
 `torch:base`

---
 .github/workflows/torch-nightly.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
index 8d700747..fb51869f 100644
--- a/.github/workflows/torch-nightly.yml
+++ b/.github/workflows/torch-nightly.yml
@@ -93,7 +93,7 @@ jobs:
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-base.yml
-      filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"os": "ubuntu20.04"}]'
+      filter: 'del(.include) | .exclude |= . + [{"os": "ubuntu20.04"}]'
   get-nccl-config:
     name: Get torch:nccl Config
     uses: ./.github/workflows/read-configuration.yml

From 3448307494403ff9915af7f78ce93fa1a40d276b Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 6 Nov 2024 23:03:21 -0600
Subject: [PATCH 11/94] ci: Build for multiple architectures

Also switch `push` back to `true` (it was set to `false` in PATCH 05).

[skip ci]
---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 065233e3..3f770eff 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -97,12 +97,12 @@ jobs:
           context: ${{ inputs.folder }}
           build-args: |-
             ${{ inputs.build-args }}
-          push: false
+          push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
           cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
-          platforms: linux/amd64
+          platforms: linux/amd64,linux/arm64
       - name: Clear registry credentials
         if: always()
         run: |

From 3b6b17d32646bdd75bd29bb7dd67f9865c7fe538 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 6 Nov 2024 23:08:28 -0600
Subject: [PATCH 12/94] ci: Build only for `linux/amd64`

[skip ci]
---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 3f770eff..79191ce7 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -102,7 +102,7 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
           cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
-          platforms: linux/amd64,linux/arm64
+          platforms: linux/amd64
       - name: Clear registry credentials
         if: always()
         run: |

From 35c959b9675fbbb78543e3f7553e6c1a1e554b03 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 13 Nov 2024 13:35:01 -0600
Subject: [PATCH 13/94] fix(torch): Include a post-v2.5.1 bugfix patch when
 building PyTorch

---
 torch/Dockerfile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 24f7f1f2..a5bb6f65 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -46,7 +46,12 @@ RUN chmod 755 /git/clone.sh
 
 FROM downloader-base as pytorch-downloader
 ARG BUILD_TORCH_VERSION
+# Includes a patch for a foreach bug in PyTorch v2.5.1
 RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \
+    if [ "${BUILD_TORCH_VERSION}" = '2.5.1' ]; then \
+      wget 'https://github.com/pytorch/pytorch/commit/1cdaf1d85f5e4b3f8952fd0737a1afeb16995d13.patch' -qO- \
+      | git -C pytorch apply; \
+    fi && \
     rm -rf pytorch/.git
 
 FROM downloader-base as torchvision-downloader

From cdafd6bc63de5c0c4914e9eb501db4dd137464cc Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 03:04:46 -0600
Subject: [PATCH 14/94] feat(torch): Parameterize `compiler_wrapper.f95`

[skip ci]
---
 torch-extras/compiler_wrapper.f95 | 32 ++++++++++++++++++++++++-------
 torch/compiler_wrapper.f95        | 32 ++++++++++++++++++++++++-------
 2 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/torch-extras/compiler_wrapper.f95 b/torch-extras/compiler_wrapper.f95
index f8c13bd2..cbdc602e 100644
--- a/torch-extras/compiler_wrapper.f95
+++ b/torch-extras/compiler_wrapper.f95
@@ -1,13 +1,25 @@
+#ifndef WRAPPER_NATIVE
+#define WRAPPER_NATIVE "skylake"
+#endif
+
+#ifndef WRAPPER_CC
+#define WRAPPER_CC "gcc"
+#endif
+
+#ifndef WRAPPER_AVX
+#define WRAPPER_AVX "AVX256"
+#endif
+
 PROGRAM compiler_wrapper
-    ! Wraps GCC invocations,
-    ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions
-    ! with -D__AVX256__, and -march=native with -march=skylake,
+    ! Wraps C compiler invocations,
+    ! replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions
+    ! with -D__<WRAPPER_AVX>__, and -march=native with -march=<WRAPPER_NATIVE>,
     ! for better reproducibility and compatibility.
     IMPLICIT NONE
     INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0
     CHARACTER(len=:), ALLOCATABLE :: arg, command
     ALLOCATE(CHARACTER(len=128) :: arg)
-    command = "gcc"
+    command = WRAPPER_CC
 
     DO i = 1, COMMAND_ARGUMENT_COUNT()
         DO
@@ -22,9 +34,15 @@ PROGRAM compiler_wrapper
             END IF
         END DO
         IF (arg == "-march=native") THEN
-            command = command // " '-march=skylake'"
-        ELSE IF (arg == "-D__AVX512__" .OR. arg == "-D__SCALAR__") THEN
-            command = command // " '-D__AVX256__'"
+            command = command // (" '-march=" // WRAPPER_NATIVE // "'")
+        ELSE IF ( &
+            arg == "-D__AVX512__" &
+            .OR. arg == "-D__AVX256__" &
+            .OR. arg == "-D__SCALAR__" &
+        ) THEN
+#ifndef WRAPPER_NO_AVX
+            command = command // (" '-D__" // WRAPPER_AVX // "__'")
+#endif
         ELSE
             command = command // shell_escaped(arg)
         END IF
diff --git a/torch/compiler_wrapper.f95 b/torch/compiler_wrapper.f95
index f8c13bd2..cbdc602e 100644
--- a/torch/compiler_wrapper.f95
+++ b/torch/compiler_wrapper.f95
@@ -1,13 +1,25 @@
+#ifndef WRAPPER_NATIVE
+#define WRAPPER_NATIVE "skylake"
+#endif
+
+#ifndef WRAPPER_CC
+#define WRAPPER_CC "gcc"
+#endif
+
+#ifndef WRAPPER_AVX
+#define WRAPPER_AVX "AVX256"
+#endif
+
 PROGRAM compiler_wrapper
-    ! Wraps GCC invocations,
-    ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions
-    ! with -D__AVX256__, and -march=native with -march=skylake,
+    ! Wraps C compiler invocations,
+    ! replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions
+    ! with -D__<WRAPPER_AVX>__, and -march=native with -march=<WRAPPER_NATIVE>,
     ! for better reproducibility and compatibility.
     IMPLICIT NONE
     INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0
     CHARACTER(len=:), ALLOCATABLE :: arg, command
     ALLOCATE(CHARACTER(len=128) :: arg)
-    command = "gcc"
+    command = WRAPPER_CC
 
     DO i = 1, COMMAND_ARGUMENT_COUNT()
         DO
@@ -22,9 +34,15 @@ PROGRAM compiler_wrapper
             END IF
         END DO
         IF (arg == "-march=native") THEN
-            command = command // " '-march=skylake'"
-        ELSE IF (arg == "-D__AVX512__" .OR. arg == "-D__SCALAR__") THEN
-            command = command // " '-D__AVX256__'"
+            command = command // (" '-march=" // WRAPPER_NATIVE // "'")
+        ELSE IF ( &
+            arg == "-D__AVX512__" &
+            .OR. arg == "-D__AVX256__" &
+            .OR. arg == "-D__SCALAR__" &
+        ) THEN
+#ifndef WRAPPER_NO_AVX
+            command = command // (" '-D__" // WRAPPER_AVX // "__'")
+#endif
         ELSE
             command = command // shell_escaped(arg)
         END IF

From e0bd93a07629cb2c6e01627bfd38eb4b9e2c94aa Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 03:07:15 -0600
Subject: [PATCH 15/94] fix(torch): Enable preprocessor when compiling
 `compiler_wrapper.f95`

[skip ci]
---
 torch-extras/Dockerfile | 2 +-
 torch/Dockerfile        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 3c785f0f..b67aea1c 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -80,7 +80,7 @@ WORKDIR /build
 # The compiler wrapper normalizes -march=native to -march=skylake
 # along with a couple other transformations before invoking GCC.
 COPY compiler_wrapper.f95 .
-RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
+RUN gfortran -cpp -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
 
 COPY --chmod=755 effective_cpu_count.sh .
 COPY --chmod=755 scale.sh .
diff --git a/torch/Dockerfile b/torch/Dockerfile
index a5bb6f65..185ecee4 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -241,7 +241,7 @@ WORKDIR /build
 COPY --chmod=755 effective_cpu_count.sh .
 COPY --chmod=755 scale.sh .
 COPY compiler_wrapper.f95 .
-RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
+RUN gfortran -cpp -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
 
 COPY <<-"EOT" /build/version-string.sh
     #!/bin/sh

From 074edd5d82ea0e672122ff4dd29bc1b6559bf452 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 03:34:54 -0600
Subject: [PATCH 16/94] build(torch): Make the build process less
 architecture-dependent

[skip ci]
---
 torch/Dockerfile | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 185ecee4..dfb99727 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -147,7 +147,7 @@ RUN apt-get -qq update && apt-get -qq install -y \
     /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \
     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
-    ln -s libomp.so.5 /usr/lib/x86_64-linux-gnu/libomp.so && \
+    ln -s libomp.so.5 "/usr/lib/$(gcc -print-multiarch)/libomp.so" && \
     ldconfig
 
 COPY --link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh
@@ -167,10 +167,14 @@ RUN export \
 
 RUN mkdir /tmp/ccache-install && \
     cd /tmp/ccache-install && \
-    CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2-linux-x86_64.tar.xz' && \
-    wget -qO - $CCACHE_URL | tar --strip-components 1 -xJf - && \
+    CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2.tar.xz' && \
+    wget -qO - "$CCACHE_URL" | tar --strip-components 1 -xJf - && \
+    mkdir build && \
+    cd build && \
+    cmake -B. -S.. -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build . --config Release && \
     make install && \
-    cd .. && \
+    cd ../.. && \
     rm -rf /tmp/ccache-install && \
     ccache -M "${BUILD_CCACHE_SIZE}" && \
     ccache -F 0
@@ -501,7 +505,7 @@ RUN apt-get -qq update && apt-get -qq install -y \
     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
     update-alternatives --install /usr/bin/vim vim /usr/bin/vim.tiny 1 && \
-    ln -s libomp.so.5 /usr/lib/x86_64-linux-gnu/libomp.so && \
+    ln -s libomp.so.5 "/usr/lib/$(gcc -print-multiarch)/libomp.so" && \
     ldconfig
 
 RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \

From dc97cd6a45d2c4ee1a541d0f9346c72b0311b941 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 03:36:25 -0600
Subject: [PATCH 17/94] fix(torch): Use `cmake` after installing it instead of
 before

[skip ci]
---
 torch/Dockerfile | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index dfb99727..2b7c04df 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -165,6 +165,14 @@ RUN export \
     rm /tmp/install_cudnn.sh && \
     apt-get clean
 
+# Add Kitware's apt repository to get a newer version of CMake
+RUN apt-get -qq update && apt-get -qq install -y \
+      software-properties-common lsb-release && \
+    { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+    | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \
+    apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
+    apt-get -qq update && apt-get -qq install -y cmake && apt-get clean
+
 RUN mkdir /tmp/ccache-install && \
     cd /tmp/ccache-install && \
     CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2.tar.xz' && \
@@ -185,14 +193,6 @@ ENV CCACHE_DIR=/ccache \
     CMAKE_CXX_COMPILER_LAUNCHER=ccache \
     CMAKE_CUDA_COMPILER_LAUNCHER=ccache
 
-# Add Kitware's apt repository to get a newer version of CMake
-RUN apt-get -qq update && apt-get -qq install -y \
-      software-properties-common lsb-release && \
-    { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \
-    | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \
-    apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
-    apt-get -qq update && apt-get -qq install -y cmake && apt-get clean
-
 # Update compiler (GCC) and linker (LLD) versions
 RUN CODENAME="$(lsb_release -cs)" && \
     wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \

From 85f3b0ea38eae8a5463ff7f1880f872036e8d25e Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 03:44:42 -0600
Subject: [PATCH 18/94] build(torch): Allow customizing `-march` with
 `--build-arg`s

[skip ci]
---
 torch/Dockerfile | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 2b7c04df..d8090574 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -245,7 +245,16 @@ WORKDIR /build
 COPY --chmod=755 effective_cpu_count.sh .
 COPY --chmod=755 scale.sh .
 COPY compiler_wrapper.f95 .
-RUN gfortran -cpp -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
+ARG AMD64_NATIVE_ARCH="skylake"
+ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres"
+RUN if [ "$(uname -m)" = "aarch64" ]; then \
+      NATIVE="${ARM64_NATIVE_ARCH}" && \
+      AVX='WRAPPER_NO_AVX'; \
+    else \
+      NATIVE="${AMD64_NATIVE_ARCH}" && \
+      AVX='WRAPPER_AVX=AVX256'; \
+    fi && \
+    gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
 
 COPY <<-"EOT" /build/version-string.sh
     #!/bin/sh

From 68121aa18427b4259b92ab2509545a92de1805c1 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 03:53:28 -0600
Subject: [PATCH 19/94] build(torch): Allow customizing `MAX_JOBS` as a build
 arg

[skip ci]
---
 torch/Dockerfile | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index d8090574..c2767668 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -338,9 +338,10 @@ ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
 # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271).
 # Without WITH_BLAS, it would detect the BLAS implementation as
 # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either.
+ARG BUILD_MAX_JOBS=""
 RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \
     --mount=type=cache,target=/ccache \
-    export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \
+    export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
     ./storage-info.sh . && \
     cd pytorch && \
@@ -388,7 +389,8 @@ RUN pip3 install --no-cache-dir --upgrade \
 
 RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=vision/,rw \
     --mount=type=cache,target=/ccache \
-    export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \
+    export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
+    echo "MAX_JOBS: ${MAX_JOBS}" && \
     cd vision && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
@@ -425,7 +427,8 @@ RUN pip3 install --no-cache-dir --upgrade \
 
 RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/,rw \
     --mount=type=cache,target=/ccache \
-    export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \
+    export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
+    echo "MAX_JOBS: ${MAX_JOBS}" && \
     cd audio && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
@@ -457,6 +460,8 @@ RUN rm ./dist/*
 # Build TransformerEngine
 RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \
     export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \
+    export MAX_JOBS="${BUILD_MAX_JOBS:-MAX_JOBS}" && \
+    echo "MAX_JOBS: ${MAX_JOBS}" && \
     cd TransformerEngine && \
     if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \
       sed -i "s/from functools import cache/from functools import lru_cache as cache/g" \
@@ -470,9 +475,10 @@ RUN rm ./dist/*
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \
     export CC=$(realpath -e ./compiler) \
-      MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)" \
+      MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)}" \
       PYTHONUNBUFFERED=1 \
       FLASH_ATTENTION_FORCE_BUILD='TRUE' && \
+    echo "MAX_JOBS: ${MAX_JOBS}" && \
     cd flash-attention && \
     ( \
       for EXT_DIR in $(realpath -s -e \

From d35022bf5045924a507234c8cb96504716ab18d6 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 04:07:33 -0600
Subject: [PATCH 20/94] build(torch): Don't apply custom `MAX_JOBS` to
 `flash-attn` build

[skip ci]
---
 torch/Dockerfile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index c2767668..bdc2d747 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -370,7 +370,8 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \
     PYTORCH_BUILD_NUMBER=0 \
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 191,186,177" \
-    python3 setup.py bdist_wheel --dist-dir ../dist
+    python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \
+    | grep -v '^ptxas /tmp/'
 RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl
 
 ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177"
@@ -475,7 +476,7 @@ RUN rm ./dist/*
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \
     export CC=$(realpath -e ./compiler) \
-      MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)}" \
+      MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)" \
       PYTHONUNBUFFERED=1 \
       FLASH_ATTENTION_FORCE_BUILD='TRUE' && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \

From 8305f7fb306e6271f0302a0ab9f83300754c487b Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 04:10:20 -0600
Subject: [PATCH 21/94] build(torch): Line-buffer `grep` output when building
 PyTorch

[skip ci]
---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index bdc2d747..251673e9 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -371,7 +371,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     PYTORCH_BUILD_NUMBER=0 \
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 191,186,177" \
     python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \
-    | grep -v '^ptxas /tmp/'
+    | grep -v --line-buffered '^ptxas /tmp/'
 RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl
 
 ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177"

From daa5a4b9cc9ca1f23d9652c3c6573db0956595db Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 04:13:27 -0600
Subject: [PATCH 22/94] build(torch): Filter more output when building PyTorch

[skip ci]
---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 251673e9..5cf4792f 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -371,7 +371,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     PYTORCH_BUILD_NUMBER=0 \
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 191,186,177" \
     python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \
-    | grep -v --line-buffered '^ptxas /tmp/'
+    | grep -Ev --line-buffered '^(ptxas /tmp/|copying torch/|creating build/)'
 RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl
 
 ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177"

From c940ad4c298bbb553dd27a021e74e2d59ee7ab3e Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 04:27:20 -0600
Subject: [PATCH 23/94] build(torch): Allow customizing TransformerEngine build
 arches

---
 torch/Dockerfile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 5cf4792f..857b5234 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -10,6 +10,7 @@ ARG BUILD_FLASH_ATTN_VERSION="2.6.3"
 ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TRITON="1"
 ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX"
+ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90"
 
 # 8.7 is supported in the PyTorch main branch, but not 2.0.0
 
@@ -459,6 +460,9 @@ FROM builder-base as transformerengine-builder
 RUN rm ./dist/*
 
 # Build TransformerEngine
+ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST
+ENV NVTE_CUDA_ARCHS=$BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST
+
 RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \
     export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \
     export MAX_JOBS="${BUILD_MAX_JOBS:-MAX_JOBS}" && \

From 863fca5680066de747d9f8f0a706137006d48f14 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 04:28:49 -0600
Subject: [PATCH 24/94] feat(torch): Add `flash-attn` 3 beta

[skip ci]
---
 torch/Dockerfile | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 857b5234..ca2648a2 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -7,6 +7,7 @@ ARG BUILD_TORCH_VISION_VERSION="0.20.0"
 ARG BUILD_TORCH_AUDIO_VERSION="2.5.0"
 ARG BUILD_TRANSFORMERENGINE_VERSION="1.11"
 ARG BUILD_FLASH_ATTN_VERSION="2.6.3"
+ARG BUILD_FLASH_ATTN_3="1"
 ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TRITON="1"
 ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX"
@@ -477,6 +478,7 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE
 FROM builder-base as flash-attn-builder
 RUN rm ./dist/*
 
+ARG BUILD_FLASH_ATTN_3
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \
     export CC=$(realpath -e ./compiler) \
@@ -485,9 +487,15 @@ RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar
       FLASH_ATTENTION_FORCE_BUILD='TRUE' && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
     cd flash-attention && \
+    if [ "$BUILD_FLASH_ATTN_3" = 1 ]; then \
+      FA3_DIR="hopper"; \
+    else \
+      FA3_DIR=""; \
+    fi && \
     ( \
       for EXT_DIR in $(realpath -s -e \
         . \
+        $FA3_DIR \
         csrc/ft_attention \
         csrc/fused_dense_lib \
         csrc/fused_softmax \

From b4ce2dac58298bc73ca34bf1277848729c08e4a2 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 06:08:27 -0600
Subject: [PATCH 25/94] build(torch): Filter even more output when building
 PyTorch

---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index ca2648a2..73cbb18a 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -373,7 +373,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     PYTORCH_BUILD_NUMBER=0 \
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 191,186,177" \
     python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \
-    | grep -Ev --line-buffered '^(ptxas /tmp/|copying torch/|creating build/)'
+    | grep -Ev --line-buffered '^(ptxas /tmp/|copying .+/|creating build/)'
 RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl
 
 ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177"

From 87c22ff437c7731daae21e8344207f6b6ccc32a0 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 06:09:11 -0600
Subject: [PATCH 26/94] build(torch-extras): Configure `compiler_wrapper.f95`
 parameters

---
 torch-extras/Dockerfile | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index b67aea1c..456420f3 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -80,7 +80,16 @@ WORKDIR /build
 # The compiler wrapper normalizes -march=native to -march=skylake
 # along with a couple other transformations before invoking GCC.
 COPY compiler_wrapper.f95 .
-RUN gfortran -cpp -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
+ARG AMD64_NATIVE_ARCH="skylake"
+ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres"
+RUN if [ "$(uname -m)" = "aarch64" ]; then \
+      NATIVE="${ARM64_NATIVE_ARCH}" && \
+      AVX='WRAPPER_NO_AVX'; \
+    else \
+      NATIVE="${AMD64_NATIVE_ARCH}" && \
+      AVX='WRAPPER_AVX=AVX256'; \
+    fi && \
+    gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
 
 COPY --chmod=755 effective_cpu_count.sh .
 COPY --chmod=755 scale.sh .

From 9cb44f4b075cdf7f6d625cb30d5019749e9713fd Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 06:11:10 -0600
Subject: [PATCH 27/94] build(torch-extras): Allow overriding `MAX_JOBS` and
 `NVCC_APPEND_FLAGS`

[skip ci]
---
 torch-extras/Dockerfile | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 456420f3..88b4029d 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -4,6 +4,7 @@ ARG BASE_IMAGE
 ARG DEEPSPEED_VERSION="0.14.4"
 ARG APEX_COMMIT="23c1f86520e22b505e8fdfcf6298273dff2d93d8"
 ARG XFORMERS_VERSION="0.0.28.post1"
+ARG BUILD_MAX_JOBS=""
 
 FROM alpine/git:2.36.3 as apex-downloader
 WORKDIR /git
@@ -94,6 +95,10 @@ RUN if [ "$(uname -m)" = "aarch64" ]; then \
 COPY --chmod=755 effective_cpu_count.sh .
 COPY --chmod=755 scale.sh .
 
+ARG BUILD_NVCC_APPEND_FLAGS=""
+ENV NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS:$BUILD_NVCC_APPEND_FLAGS }-diag-suppress 186,177"
+ARG BUILD_MAX_JOBS
+
 
 FROM builder-base as deepspeed-builder
 # DeepSpeed build flags
@@ -153,7 +158,7 @@ RUN python3 -m pip install -U --no-cache-dir \
       do if [[ -z ${!VAR} ]]; then unset ${VAR}; fi; done; \
     } && \
     CC=$(realpath -e ./compiler) \
-      MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)" \
+      MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)}" \
       python3 -m pip wheel -w /wheels \
       --no-cache-dir --no-build-isolation --no-deps \
       deepspeed==${DEEPSPEED_VERSION} && \
@@ -187,8 +192,7 @@ RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \
       :; \
     )" && \
     export CC=$(realpath -e ./compiler) && \
-    export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 24)" && \
-    export NVCC_APPEND_FLAGS='-diag-suppress 186,177' && \
+    export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 24)}" && \
     printf -- '--config-settings="--build-option=%s" ' $( \
       echo \
         --cpp_ext \
@@ -232,7 +236,6 @@ RUN python3 -m pip install -U --no-cache-dir \
     CC=$(realpath -e ./compiler) \
       MAX_JOBS=1 \
       PYTHONUNBUFFERED=1 \
-      NVCC_APPEND_FLAGS='-diag-suppress 186,177' \
       XFORMERS_DISABLE_FLASH_ATTN=1 \
       python3 -m pip wheel -w /wheels -v \
       --no-cache-dir --no-build-isolation --no-deps \

From 4a6f2a769ec1c82b044e3463a9d65cbe1b1ada95 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 06:13:18 -0600
Subject: [PATCH 28/94] fix(torch-extras): Add missing `+` in parameter
 expansion

[skip ci]
---
 torch-extras/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 88b4029d..96866ddd 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -96,7 +96,7 @@ COPY --chmod=755 effective_cpu_count.sh .
 COPY --chmod=755 scale.sh .
 
 ARG BUILD_NVCC_APPEND_FLAGS=""
-ENV NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS:$BUILD_NVCC_APPEND_FLAGS }-diag-suppress 186,177"
+ENV NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS:+$BUILD_NVCC_APPEND_FLAGS }-diag-suppress 186,177"
 ARG BUILD_MAX_JOBS
 
 

From 1dfcbc118cd2b51a46600114915daf553313e9ad Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 06:23:27 -0600
Subject: [PATCH 29/94] feat(torch-extras): Build DeepSpeed-Kernels

[skip ci]
---
 torch-extras/Dockerfile | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 96866ddd..d7af221c 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -3,6 +3,7 @@
 ARG BASE_IMAGE
 ARG DEEPSPEED_VERSION="0.14.4"
 ARG APEX_COMMIT="23c1f86520e22b505e8fdfcf6298273dff2d93d8"
+ARG DEEPSPEED_KERNELS_COMMIT="e77acc40b104696d4e73229b787d1ef29a9685b1"
 ARG XFORMERS_VERSION="0.0.28.post1"
 ARG BUILD_MAX_JOBS=""
 
@@ -17,6 +18,18 @@ RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
       --depth 1 --filter=blob:none && \
     find -type d -name docs -prune -exec rm -r '{}' ';'
 
+
+FROM alpine/git:2.36.3 as ds-kernels-downloader
+WORKDIR /git
+ARG DEEPSPEED_KERNELS_COMMIT
+RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
+      https://github.com/microsoft/DeepSpeed-Kernels ds-kernels && \
+    cd ds-kernels && \
+    git checkout "${DEEPSPEED_KERNELS_COMMIT}" && \
+    git submodule update --init --recursive --jobs 8 \
+      --depth 1 --filter=blob:none
+
+
 # Dependencies requiring NVCC are built ahead of time in a separate stage
 # so that the ~2 GiB dev library installations don't have to be included
 # in the final image.
@@ -101,6 +114,15 @@ ARG BUILD_MAX_JOBS
 
 
 FROM builder-base as deepspeed-builder
+
+RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=ds-kernels/,rw \
+    cd ds-kernels && \
+    export CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" && \
+    echo "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" && \
+    python3 -m pip wheel -w /wheels \
+      --no-cache-dir --no-build-isolation --no-deps . && \
+    python3 -m pip install /wheels/*.whl
+
 # DeepSpeed build flags
 # See: https://www.deepspeed.ai/tutorials/advanced-install
 ARG DS_BUILD_OPS="1"

From 8a44533fa96c4713b70d726eddd0901c91b6f1c4 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 06:27:34 -0600
Subject: [PATCH 30/94] fix(torch-extras): Use separate build argument for
 DS-Kernels arches

[skip ci]
---
 torch-extras/Dockerfile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index d7af221c..dfe0b05c 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -4,6 +4,7 @@ ARG BASE_IMAGE
 ARG DEEPSPEED_VERSION="0.14.4"
 ARG APEX_COMMIT="23c1f86520e22b505e8fdfcf6298273dff2d93d8"
 ARG DEEPSPEED_KERNELS_COMMIT="e77acc40b104696d4e73229b787d1ef29a9685b1"
+ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST="80;86;89;90"
 ARG XFORMERS_VERSION="0.0.28.post1"
 ARG BUILD_MAX_JOBS=""
 
@@ -115,9 +116,10 @@ ARG BUILD_MAX_JOBS
 
 FROM builder-base as deepspeed-builder
 
+ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST
 RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=ds-kernels/,rw \
     cd ds-kernels && \
-    export CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" && \
+    export CUDA_ARCH_LIST="${DEEPSPEED_KERNELS_CUDA_ARCH_LIST}" && \
     echo "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" && \
     python3 -m pip wheel -w /wheels \
       --no-cache-dir --no-build-isolation --no-deps . && \

From a3e444d7f1f4ffb9dc6de12165c8013dd936fc09 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 06:34:14 -0600
Subject: [PATCH 31/94] build(torch-extras): Install `py-cpuinfo` before
 building DeepSpeed

[skip ci]
---
 torch-extras/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index dfe0b05c..552b7888 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -151,7 +151,7 @@ ARG DEEPSPEED_VERSION
 
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 RUN python3 -m pip install -U --no-cache-dir \
-      setuptools wheel pip deepspeed-kernels && \
+      setuptools wheel pip deepspeed-kernels py-cpuinfo && \
     if python3 -m pip show torch | grep 'Version: 2\.[1-9]' > /dev/null; then \
       # DeepSpeed's AIO extension is incompatible with PyTorch 2.1.x's
       # requirement for C++17 (as of DeepSpeed 0.10.1).

From cc76549895d5c4b00fa6298d6f0904013e8d7374 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 13:17:07 -0600
Subject: [PATCH 32/94] build(torch): Allow setting `MAX_JOBS` when building
 Triton

---
 torch/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 73cbb18a..cd06d352 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -300,10 +300,11 @@ ENV CMAKE_PREFIX_PATH=/usr/bin/ \
     CUDNN_LIB_DIR=/usr/local/cuda/lib64
 
 ARG BUILD_TRITON
+ARG BUILD_MAX_JOBS=""
 RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \
     --mount=type=cache,target=/ccache \
     if [ "$BUILD_TRITON" = '1' ]; then \
-      export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \
+      export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
       cd triton/python && \
       python3 -m pip wheel -w ../../dist/ --no-build-isolation --no-deps -vv . && \
       pip3 install ../../dist/*.whl; \
@@ -340,7 +341,6 @@ ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
 # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271).
 # Without WITH_BLAS, it would detect the BLAS implementation as
 # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either.
-ARG BUILD_MAX_JOBS=""
 RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \
     --mount=type=cache,target=/ccache \
     export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \

From a444c4439c1c1098b33f7b513bcb9ff8ecc28441 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 13:18:56 -0600
Subject: [PATCH 33/94] build(torch): Allow configuring `NVCC_APPEND_FLAGS` as
 a build argument

---
 torch/Dockerfile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index cd06d352..a1dc3370 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -315,6 +315,9 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST
 ENV TORCH_VERSION=$BUILD_TORCH_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
 
+ARG BUILD_NVCC_APPEND_FLAGS=""
+ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}"
+
 # If the directory /opt/nccl-tests exists,
 # the base image is assumed to be nccl-tests,
 # so it uses the system's special NCCL and UCC installations for the build.
@@ -371,13 +374,11 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     WITH_BLAS=FLAME \
     PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \
     PYTORCH_BUILD_NUMBER=0 \
-    TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 191,186,177" \
+    TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \
     | grep -Ev --line-buffered '^(ptxas /tmp/|copying .+/|creating build/)'
 RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl
 
-ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177"
-
 RUN python3 -m pip install -U --no-cache-dir \
       packaging setuptools wheel pip
 

From a8bbb8a655bd3fa91b60d007b71fddeecc12dadb Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 13:21:54 -0600
Subject: [PATCH 34/94] build(torch-extras): Don't install `cuda-nvprof` before
 building

[skip ci]
---
 torch-extras/Dockerfile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 552b7888..46747c6a 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -47,7 +47,6 @@ RUN export \
       libcublas-dev-${CUDA_PACKAGE_VERSION} \
       libcusparse-dev-${CUDA_PACKAGE_VERSION} \
       libcusolver-dev-${CUDA_PACKAGE_VERSION} \
-      cuda-nvprof-${CUDA_PACKAGE_VERSION} \
       cuda-profiler-api-${CUDA_PACKAGE_VERSION} \
       cuda-nvtx-${CUDA_PACKAGE_VERSION} \
       cuda-nvrtc-dev-${CUDA_PACKAGE_VERSION} && \

From de9a3de9b395d0904af43b43a27cb5ee967c9a3a Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 13:57:34 -0600
Subject: [PATCH 35/94] build(torch): Conditionally enable
 `USE_PRIORITIZED_TEXT_FOR_LD`

[skip ci]
---
 torch/Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index a1dc3370..d82a23ea 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -354,6 +354,9 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
     ln -s /usr/bin/c++ build/c++ && \
+    if [ "$(uname -m)" = 'aarch64' ]; then \
+      export USE_PRIORITIZED_TEXT_FOR_LD=1; \
+    fi && \
     { if [ -d /opt/nccl-tests ]; then \
       export \
         USE_DISTRIBUTED=1 \

From a889d618b229e6c1f8eae3d52ccd598658b0070d Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 14:21:20 -0600
Subject: [PATCH 36/94] build(torch): Don't use `lld` on `aarch64`

Only the standard `ld` linker is compatible with `USE_PRIORITIZED_TEXT_FOR_LD`, which the PyTorch build now enables on `aarch64`.

[skip ci]
---
 torch/Dockerfile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index d82a23ea..87e0924b 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -209,7 +209,9 @@ RUN CODENAME="$(lsb_release -cs)" && \
     update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \
     update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
     update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \
-    update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1
+    if [ "$(uname -m)" != 'aarch64' ]; then \
+      update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1; \
+    fi
 
 # Install AOCL-BLAS and AOCL-LAPACK
 # See: https://www.amd.com/en/developer/aocl/dense.html

From 86c27ff5e3c55ac7c56a3d6e271aaf25e3290562 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 18 Nov 2024 14:27:32 -0600
Subject: [PATCH 37/94] build(torch): Set `-eo pipefail` for the PyTorch build
 command

[skip ci]
---
 torch/Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 87e0924b..2ee2e244 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -346,6 +346,7 @@ ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BU
 # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271).
 # Without WITH_BLAS, it would detect the BLAS implementation as
 # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either.
+SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
 RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \
     --mount=type=cache,target=/ccache \
     export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
@@ -382,6 +383,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \
     | grep -Ev --line-buffered '^(ptxas /tmp/|copying .+/|creating build/)'
+SHELL ["/bin/sh", "-c"]
 RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl
 
 RUN python3 -m pip install -U --no-cache-dir \

From 3b14ffb0cb5c8f0f8f0d9ad5fed5449d561223b2 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 19 Nov 2024 15:14:40 -0600
Subject: [PATCH 38/94] feat(torch): Update TransformerEngine to v1.12

---
 torch/Dockerfile | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 2ee2e244..dea32e35 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -5,7 +5,7 @@ ARG FINAL_BASE_IMAGE="nvidia/cuda:12.4.1-base-ubuntu22.04"
 ARG BUILD_TORCH_VERSION="2.5.1"
 ARG BUILD_TORCH_VISION_VERSION="0.20.0"
 ARG BUILD_TORCH_AUDIO_VERSION="2.5.0"
-ARG BUILD_TRANSFORMERENGINE_VERSION="1.11"
+ARG BUILD_TRANSFORMERENGINE_VERSION="1.12"
 ARG BUILD_FLASH_ATTN_VERSION="2.6.3"
 ARG BUILD_FLASH_ATTN_3="1"
 ARG BUILD_TRITON_VERSION=""
@@ -81,13 +81,6 @@ FROM downloader-base as transformerengine-downloader
 ARG BUILD_TRANSFORMERENGINE_VERSION
 RUN ./clone.sh NVIDIA/TransformerEngine TransformerEngine "${BUILD_TRANSFORMERENGINE_VERSION}"
 
-# Include a patch commit that is sort-of part of v1.11 but isn't in their v1.11 release git tag
-# See https://github.com/NVIDIA/TransformerEngine/pull/1222
-RUN if [ "${BUILD_TRANSFORMERENGINE_VERSION}" = '1.11' ]; then \
-      wget 'https://github.com/NVIDIA/TransformerEngine/commit/fc034785f5e3a5bc5600a88766d9a1d75137ce77.patch' -qO- \
-      | git -C TransformerEngine apply -v --stat --apply -; \
-    fi
-
 FROM downloader-base as flash-attn-downloader
 ARG BUILD_FLASH_ATTN_VERSION
 RUN ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_VERSION}"

From bb07aedbc0de403f6765c420eee20b2738dcd4f1 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 19 Nov 2024 15:15:37 -0600
Subject: [PATCH 39/94] build(torch): Use `ccache` more often

---
 torch/Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index dea32e35..19736455 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -466,6 +466,7 @@ ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST
 ENV NVTE_CUDA_ARCHS=$BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST
 
 RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \
+    --mount=type=cache,target=/ccache \
     export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \
     export MAX_JOBS="${BUILD_MAX_JOBS:-MAX_JOBS}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
@@ -482,6 +483,7 @@ RUN rm ./dist/*
 ARG BUILD_FLASH_ATTN_3
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \
+    --mount=type=cache,target=/ccache \
     export CC=$(realpath -e ./compiler) \
       MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)" \
       PYTHONUNBUFFERED=1 \

From 65433acd72130f3a663f901a15eda26154ba3ae5 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 19 Nov 2024 15:16:49 -0600
Subject: [PATCH 40/94] feat(torch): Compile `flash-attn` 3 as a separate
 package

[skip ci]
---
 torch/Dockerfile | 77 +++++++++++++++++++++++++++++-------------------
 1 file changed, 47 insertions(+), 30 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 19736455..c960dba4 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -7,7 +7,7 @@ ARG BUILD_TORCH_VISION_VERSION="0.20.0"
 ARG BUILD_TORCH_AUDIO_VERSION="2.5.0"
 ARG BUILD_TRANSFORMERENGINE_VERSION="1.12"
 ARG BUILD_FLASH_ATTN_VERSION="2.6.3"
-ARG BUILD_FLASH_ATTN_3="1"
+ARG BUILD_FLASH_ATTN_3_VERSION="2.7.0.post2"
 ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TRITON="1"
 ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX"
@@ -85,6 +85,14 @@ FROM downloader-base as flash-attn-downloader
 ARG BUILD_FLASH_ATTN_VERSION
 RUN ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_VERSION}"
 
+FROM downloader-base as flash-attn-3-downloader
+ARG BUILD_FLASH_ATTN_3_VERSION
+RUN if [ -n "$BUILD_FLASH_ATTN_3_VERSION" ]; then \
+      ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_3_VERSION}"; \
+    else \
+      mkdir flash-attention; \
+    fi
+
 FROM downloader-base as triton-version
 ENV TRITON_COMMIT_FILE='.ci/docker/ci_commit_pins/triton.txt'
 COPY --link --from=pytorch-downloader "/git/pytorch/${TRITON_COMMIT_FILE}" /git/version.txt
@@ -477,43 +485,52 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE
     fi && \
     python3 setup.py bdist_wheel --dist-dir /build/dist
 
-FROM builder-base as flash-attn-builder
+FROM builder-base as flash-attn-builder-base
 RUN rm ./dist/*
+ENV PYTHONUNBUFFERED=1
+ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
+ARG BUILD_FLASH_ATTN_MAX_JOBS=""
+
+COPY <<-"EOT" /build/fa-build.sh
+    #!/bin/bash
+    set -eo pipefail;
+    if [ -n "$1" ]; then cd "$1"; fi;
+    python3 setup.py bdist_wheel --dist-dir /build/dist \
+    | grep -Ev --line-buffered '^ptxas /tmp/|ptxas info\s*:|bytes spill stores'
+EOT
+RUN chmod 755 /build/fa-build.sh
+
+FROM flash-attn-builder-base as flash-attn-builder
 
-ARG BUILD_FLASH_ATTN_3
-SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+# Build flash-attn
 RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \
     --mount=type=cache,target=/ccache \
     export CC=$(realpath -e ./compiler) \
-      MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)" \
-      PYTHONUNBUFFERED=1 \
-      FLASH_ATTENTION_FORCE_BUILD='TRUE' && \
+      MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
     cd flash-attention && \
-    if [ "$BUILD_FLASH_ATTN_3" = 1 ]; then \
-      FA3_DIR="hopper"; \
-    else \
-      FA3_DIR=""; \
+    for EXT_DIR in $(realpath -s -e \
+      . \
+      csrc/ft_attention \
+      csrc/fused_dense_lib \
+      csrc/fused_softmax \
+      csrc/layer_norm \
+      csrc/rotary \
+      csrc/xentropy); \
+    do /build/fa-build.sh; done
+
+FROM flash-attn-builder-base as flash-attn-3-builder
+
+# Build flash-attn v3
+RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,target=flash-attention/,rw \
+    --mount=type=cache,target=/ccache \
+    if [ ! -d flash-attention/hopper ]; then \
+      echo "Not compiling flash-attn v3" && exit 0; \
     fi && \
-    ( \
-      for EXT_DIR in $(realpath -s -e \
-        . \
-        $FA3_DIR \
-        csrc/ft_attention \
-        csrc/fused_dense_lib \
-        csrc/fused_softmax \
-        csrc/layer_norm \
-        csrc/rotary \
-        csrc/xentropy); \
-      do \
-          cd $EXT_DIR && \
-          python3 setup.py bdist_wheel --dist-dir /build/dist && \
-          cd - || \
-          exit 1; \
-      done; \
-    ) | \
-    grep -Ev --line-buffered 'ptxas info\s*:|bytes spill stores'
-SHELL ["/bin/sh", "-c"]
+    export CC=$(realpath -e ./compiler) \
+      MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 10 6)}" && \
+    echo "MAX_JOBS: ${MAX_JOBS}" && \
+    /build/fa-build.sh flash-attention/hopper
 
 FROM builder-base as builder
 COPY --link --from=torchaudio-builder /build/dist/ /build/dist/

From 69be6f1081df2572d676a336778b83d55bc542a8 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 19 Nov 2024 15:39:00 -0600
Subject: [PATCH 41/94] build(torch): Use tabs for heredoc indentation

---
 torch/Dockerfile | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index c960dba4..352a0ae3 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -31,16 +31,16 @@ COPY <<-"EOT" /git/clone.sh
     DEST="$2";
     REF="$3";
 
-    CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; };
-
-    # Try cloning REF as a tag prefixed with "v", otherwise fall back
-    # to git checkout for commit hashes
-    CLONE --recurse-submodules --shallow-submodules --also-filter-submodules --no-tags \
-      "$REPO" -b "v$REF" "$DEST" || { \
-        CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \
-        git -C "$DEST" checkout "$REF" && \
-        git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \
-    };
+	CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; };
+
+	# Try cloning REF as a tag prefixed with "v", otherwise fall back
+	# to git checkout for commit hashes
+	CLONE --recurse-submodules --shallow-submodules --also-filter-submodules --no-tags \
+	  "$REPO" -b "v$REF" "$DEST" || { \
+	    CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \
+	    git -C "$DEST" checkout "$REF" && \
+	    git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \
+	};
 EOT
 
 RUN chmod 755 /git/clone.sh
@@ -492,11 +492,11 @@ ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
 ARG BUILD_FLASH_ATTN_MAX_JOBS=""
 
 COPY <<-"EOT" /build/fa-build.sh
-    #!/bin/bash
-    set -eo pipefail;
-    if [ -n "$1" ]; then cd "$1"; fi;
-    python3 setup.py bdist_wheel --dist-dir /build/dist \
-    | grep -Ev --line-buffered '^ptxas /tmp/|ptxas info\s*:|bytes spill stores'
+	#!/bin/bash
+	set -eo pipefail;
+	if [ -n "$1" ]; then cd "$1"; fi;
+	python3 setup.py bdist_wheel --dist-dir /build/dist \
+	| grep -Ev --line-buffered '^ptxas /tmp/|ptxas info\s*:|bytes spill stores'
 EOT
 RUN chmod 755 /build/fa-build.sh
 

From ecfac12e014e8ca4321117d8ec458e969b88d0ae Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 19 Nov 2024 15:39:27 -0600
Subject: [PATCH 42/94] build(torch): Invoke `fa-build.sh` correctly

---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 352a0ae3..6351688c 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -517,7 +517,7 @@ RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar
       csrc/layer_norm \
       csrc/rotary \
       csrc/xentropy); \
-    do /build/fa-build.sh; done
+    do /build/fa-build.sh "$EXT_DIR" || exit -1; done
 
 FROM flash-attn-builder-base as flash-attn-3-builder
 

From a7d7e04066416e279869f68fc2c9ffa0d106a5cb Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 19 Nov 2024 15:50:22 -0600
Subject: [PATCH 43/94] build(torch): Use tabs for other heredocs

---
 torch/Dockerfile | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 6351688c..33e02d1f 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -26,10 +26,10 @@ WORKDIR /git
 RUN git config --global advice.detachedHead false
 
 COPY <<-"EOT" /git/clone.sh
-    #!/bin/sh
-    REPO="https://github.com/$1";
-    DEST="$2";
-    REF="$3";
+	#!/bin/sh
+	REPO="https://github.com/$1";
+	DEST="$2";
+	REF="$3";
 
 	CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; };
 
@@ -262,31 +262,31 @@ RUN if [ "$(uname -m)" = "aarch64" ]; then \
     gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
 
 COPY <<-"EOT" /build/version-string.sh
-    #!/bin/sh
-    set -e;
-    VERSION="$1";
-
-    IS_HASH() {
-      echo "$1" | grep -qxiEe '[0-9a-f]{40}';
-    };
-
-    if IS_HASH "$VERSION"; then
-      REAL_VERSION="$(cat ./version.txt)";
-      SHORT_HASH="$(echo "$VERSION" | cut -c1-7)";
-      echo "$REAL_VERSION+$SHORT_HASH";
-    else
-      echo "$VERSION";
-    fi;
+	#!/bin/sh
+	set -e;
+	VERSION="$1";
+
+	IS_HASH() {
+	  echo "$1" | grep -qxiEe '[0-9a-f]{40}';
+	};
+
+	if IS_HASH "$VERSION"; then
+	  REAL_VERSION="$(cat ./version.txt)";
+	  SHORT_HASH="$(echo "$VERSION" | cut -c1-7)";
+	  echo "$REAL_VERSION+$SHORT_HASH";
+	else
+	  echo "$VERSION";
+	fi;
 EOT
 RUN chmod 755 /build/version-string.sh
 
 COPY <<-"EOT" /build/storage-info.sh
-    #!/bin/sh
-    set -e;
-    TARGET="$(realpath "$1")";
+	#!/bin/sh
+	set -e;
+	TARGET="$(realpath "$1")";
 
-    STORAGE_INFO="$(df -h '--output=fstype,used,avail,pcent,target' "$TARGET")" || exit 0;
-    printf 'Storage info for %s:\n%s\n' "$TARGET" "$STORAGE_INFO";
+	STORAGE_INFO="$(df -h '--output=fstype,used,avail,pcent,target' "$TARGET")" || exit 0;
+	printf 'Storage info for %s:\n%s\n' "$TARGET" "$STORAGE_INFO";
 EOT
 RUN chmod 755 /build/storage-info.sh
 

From 4e48edbcbc44f201596adbde29df3ee0d7320031 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 19 Nov 2024 15:50:29 -0600
Subject: [PATCH 44/94] build(torch): Use the resulting artifact from
 `flash-attn-3-builder`

[skip ci]
---
 torch/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 33e02d1f..3b8e36e7 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -537,6 +537,7 @@ COPY --link --from=torchaudio-builder /build/dist/ /build/dist/
 COPY --link --from=torchvision-builder /build/dist/ /build/dist/
 COPY --link --from=transformerengine-builder /build/dist/ /build/dist/
 COPY --link --from=flash-attn-builder /build/dist/ /build/dist/
+COPY --link --from=flash-attn-3-builder /build/dist/ /build/dist/
 
 ## Build the final torch image.
 FROM ${FINAL_BASE_IMAGE}

From ac1610abdfafe5e18bb342d94be417e7b3393555 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 19 Nov 2024 16:54:01 -0600
Subject: [PATCH 45/94] build(torch): Filter more lines while building
 `flash-attn`

---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 3b8e36e7..3170cf4b 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -496,7 +496,7 @@ COPY <<-"EOT" /build/fa-build.sh
 	set -eo pipefail;
 	if [ -n "$1" ]; then cd "$1"; fi;
 	python3 setup.py bdist_wheel --dist-dir /build/dist \
-	| grep -Ev --line-buffered '^ptxas /tmp/|ptxas info\s*:|bytes spill stores'
+	| grep -Ev --line-buffered '^ptxas (/tmp/|(info|warning)\s*:)|bytes spill stores'
 EOT
 RUN chmod 755 /build/fa-build.sh
 

From 27f196498f8df4989a91382f9f7c3c23da608423 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 19 Nov 2024 16:57:29 -0600
Subject: [PATCH 46/94] build(torch): Build `flash-attn` and `flash-attn` 3 in
 sequence

[skip ci]
---
 torch/Dockerfile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 3170cf4b..91f75cc1 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -521,6 +521,10 @@ RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar
 
 FROM flash-attn-builder-base as flash-attn-3-builder
 
+# Artifically sequence this build stage after the previous one
+# to prevent parallelism, because these are both very resource-intensive
+RUN --mount=type=bind,from=flash-attn-builder,source-/build,target=/build :
+
 # Build flash-attn v3
 RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,target=flash-attention/,rw \
     --mount=type=cache,target=/ccache \

From 83367bcaccd1780489698a957b6da68d7a8f23bc Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 19 Nov 2024 16:58:36 -0600
Subject: [PATCH 47/94] fix(torch): Fix typo in bind mount's `source=`
 parameter

[skip ci]
---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 91f75cc1..99983c7a 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -523,7 +523,7 @@ FROM flash-attn-builder-base as flash-attn-3-builder
 
 # Artifically sequence this build stage after the previous one
 # to prevent parallelism, because these are both very resource-intensive
-RUN --mount=type=bind,from=flash-attn-builder,source-/build,target=/build :
+RUN --mount=type=bind,from=flash-attn-builder,source=/build,target=/build :
 
 # Build flash-attn v3
 RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,target=flash-attention/,rw \

From 915e47ecbca88b4255784ab9bb16c52de7d82e95 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 19 Nov 2024 17:13:17 -0600
Subject: [PATCH 48/94] fix(torch): Use `exit 1` instead of `exit -1`

[skip ci]
---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 99983c7a..d2604b38 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -517,7 +517,7 @@ RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar
       csrc/layer_norm \
       csrc/rotary \
       csrc/xentropy); \
-    do /build/fa-build.sh "$EXT_DIR" || exit -1; done
+    do /build/fa-build.sh "$EXT_DIR" || exit 1; done
 
 FROM flash-attn-builder-base as flash-attn-3-builder
 

From 7162b4463d65986c8c9374e4cc6181155cf854cf Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 25 Nov 2024 21:13:47 -0600
Subject: [PATCH 49/94] fix(torch): Broaden criteria to apply PyTorch patch

---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index d2604b38..38f0f72a 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -50,7 +50,7 @@ FROM downloader-base as pytorch-downloader
 ARG BUILD_TORCH_VERSION
 # Includes a patch for a foreach bug in PyTorch v2.5.1
 RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \
-    if [ "${BUILD_TORCH_VERSION}" = '2.5.1' ]; then \
+    if [ "${BUILD_TORCH_VERSION}" != '2.6.0' ]; then \
       wget 'https://github.com/pytorch/pytorch/commit/1cdaf1d85f5e4b3f8952fd0737a1afeb16995d13.patch' -qO- \
       | git -C pytorch apply; \
     fi && \

From d6e73e7edf90150d2169d65d3a2841e05cfb1509 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 25 Nov 2024 21:16:22 -0600
Subject: [PATCH 50/94] feat(torch): Force compilation for compute capability
 9.0a

---
 torch-extras/Dockerfile | 2 +-
 torch/Dockerfile        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 46747c6a..5c8ace94 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -108,7 +108,7 @@ RUN if [ "$(uname -m)" = "aarch64" ]; then \
 COPY --chmod=755 effective_cpu_count.sh .
 COPY --chmod=755 scale.sh .
 
-ARG BUILD_NVCC_APPEND_FLAGS=""
+ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a"
 ENV NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS:+$BUILD_NVCC_APPEND_FLAGS }-diag-suppress 186,177"
 ARG BUILD_MAX_JOBS
 
diff --git a/torch/Dockerfile b/torch/Dockerfile
index 38f0f72a..7ea2807d 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -318,7 +318,7 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST
 ENV TORCH_VERSION=$BUILD_TORCH_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
 
-ARG BUILD_NVCC_APPEND_FLAGS=""
+ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a"
 ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}"
 
 # If the directory /opt/nccl-tests exists,

From b50c6f230db6053105506c6631c33efe178efb39 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 25 Nov 2024 21:21:59 -0600
Subject: [PATCH 51/94] fix(torch): Restore original criteria to apply PyTorch
 v2.5.1 patch

---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 7ea2807d..a8281183 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -50,7 +50,7 @@ FROM downloader-base as pytorch-downloader
 ARG BUILD_TORCH_VERSION
 # Includes a patch for a foreach bug in PyTorch v2.5.1
 RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \
-    if [ "${BUILD_TORCH_VERSION}" != '2.6.0' ]; then \
+    if [ "${BUILD_TORCH_VERSION}" = '2.5.1' ]; then \
       wget 'https://github.com/pytorch/pytorch/commit/1cdaf1d85f5e4b3f8952fd0737a1afeb16995d13.patch' -qO- \
       | git -C pytorch apply; \
     fi && \

From 626b44dce0e6ab35a87e6f47a46ac8795af5ce38 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 25 Nov 2024 21:30:14 -0600
Subject: [PATCH 52/94] feat(torch): Specify string preprocessor definitions
 correctly

---
 torch-extras/Dockerfile | 6 +++---
 torch/Dockerfile        | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 5c8ace94..2af92114 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -97,11 +97,11 @@ COPY compiler_wrapper.f95 .
 ARG AMD64_NATIVE_ARCH="skylake"
 ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres"
 RUN if [ "$(uname -m)" = "aarch64" ]; then \
-      NATIVE="${ARM64_NATIVE_ARCH}" && \
+      NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \
       AVX='WRAPPER_NO_AVX'; \
     else \
-      NATIVE="${AMD64_NATIVE_ARCH}" && \
-      AVX='WRAPPER_AVX=AVX256'; \
+      NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \
+      AVX='WRAPPER_AVX="AVX256"'; \
     fi && \
     gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
 
diff --git a/torch/Dockerfile b/torch/Dockerfile
index a8281183..29f78b28 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -253,11 +253,11 @@ COPY compiler_wrapper.f95 .
 ARG AMD64_NATIVE_ARCH="skylake"
 ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres"
 RUN if [ "$(uname -m)" = "aarch64" ]; then \
-      NATIVE="${ARM64_NATIVE_ARCH}" && \
+      NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \
       AVX='WRAPPER_NO_AVX'; \
     else \
-      NATIVE="${AMD64_NATIVE_ARCH}" && \
-      AVX='WRAPPER_AVX=AVX256'; \
+      NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \
+      AVX='WRAPPER_AVX="AVX256"'; \
     fi && \
     gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
 

From 762021f9164bfe2401999cedf2884eaf92827860 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 25 Nov 2024 21:40:49 -0600
Subject: [PATCH 53/94] fix(torch): Install `pybind11` before attempting to
 build Triton

---
 torch/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 29f78b28..07024996 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -307,6 +307,7 @@ ARG BUILD_MAX_JOBS=""
 RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \
     --mount=type=cache,target=/ccache \
     if [ "$BUILD_TRITON" = '1' ]; then \
+      pip3 install --no-cache-dir pybind11 && \
       export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
       cd triton/python && \
       python3 -m pip wheel -w ../../dist/ --no-build-isolation --no-deps -vv . && \

From 04cfc691be58adbb59f67f7928ceb8f3044e8174 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 25 Nov 2024 22:48:29 -0600
Subject: [PATCH 54/94] build(torch): Add missing `$` in `MAX_JOBS` default for
 TE

---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 07024996..be5464e7 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -477,7 +477,7 @@ ENV NVTE_CUDA_ARCHS=$BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST
 RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \
     --mount=type=cache,target=/ccache \
     export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \
-    export MAX_JOBS="${BUILD_MAX_JOBS:-MAX_JOBS}" && \
+    export MAX_JOBS="${BUILD_MAX_JOBS:-$MAX_JOBS}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
     cd TransformerEngine && \
     if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \

From e6dac89c4205cff3a2c57d4b2ed90d0a3946d5b7 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 26 Nov 2024 12:23:17 -0600
Subject: [PATCH 55/94] ci(torch): Drop CUDA 12.2.2 build

---
 .github/configurations/torch-base.yml | 2 +-
 .github/configurations/torch-nccl.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml
index 4c94e304..cb727713 100644
--- a/.github/configurations/torch-base.yml
+++ b/.github/configurations/torch-base.yml
@@ -1,4 +1,4 @@
-cuda: [ 12.6.2, 12.4.1, 12.2.2 ]
+cuda: [ 12.6.2, 12.4.1 ]
 os: [ ubuntu22.04, ubuntu20.04 ]
 include:
   - torch: 2.5.1
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index 74da04cc..f6dfd26f 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -1,4 +1,4 @@
-cuda: [ 12.6.2, 12.4.1, 12.2.2 ]
+cuda: [ 12.6.2, 12.4.1 ]
 os: [ ubuntu22.04, ubuntu20.04 ]
 include:
   - torch: 2.5.1

From 56f06edd96b76ca2ab1bd0ac18dda3139793e418 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 26 Nov 2024 12:26:32 -0600
Subject: [PATCH 56/94] feat(torch-extras): Update Apex to `a1df804`

---
 torch-extras/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 2af92114..0b967a0b 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -2,7 +2,7 @@
 
 ARG BASE_IMAGE
 ARG DEEPSPEED_VERSION="0.14.4"
-ARG APEX_COMMIT="23c1f86520e22b505e8fdfcf6298273dff2d93d8"
+ARG APEX_COMMIT="a1df80457ba67d60cbdb0d3ddfb08a2702c821a8"
 ARG DEEPSPEED_KERNELS_COMMIT="e77acc40b104696d4e73229b787d1ef29a9685b1"
 ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST="80;86;89;90"
 ARG XFORMERS_VERSION="0.0.28.post1"

From bcd5fabf67f8d1c42071a0c20fec50c3436b366f Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 2 Dec 2024 19:46:59 -0600
Subject: [PATCH 57/94] feat(torch): Update `torch:nccl` base images for HPC-X
 v2.21

---
 .github/configurations/torch-nccl.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index f6dfd26f..94d85d47 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -5,4 +5,4 @@ include:
     vision: 0.20.0
     audio: 2.5.0
     nccl: 2.23.4-1
-    nccl-tests-hash: c58f522
+    nccl-tests-hash: 3ef8839

From c14235b0e33370ac22f0e4afc3882217b765a273 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 3 Dec 2024 17:09:53 -0600
Subject: [PATCH 58/94] ci: Update to newer self-hosted runners

[skip ci]
---
 .github/workflows/build.yml | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 79191ce7..fdb4cc41 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -34,7 +34,7 @@ jobs:
   build:
     name: Build Images
     runs-on: [ cw ]
-    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.6.0'
     defaults:
       run:
         shell: bash
@@ -57,10 +57,21 @@ jobs:
         with:
           driver: remote
           endpoint: ${{ secrets.BUILDKIT_CONSUMER_ENDPOINT }}
+          append: |
+            - endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }}
+              platforms: linux/amd64
+            - endpoint: ${{ secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }}
+              platforms: linux/arm64
         env:
           BUILDER_NODE_0_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
           BUILDER_NODE_0_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
           BUILDER_NODE_0_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
+          BUILDER_NODE_1_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
+          BUILDER_NODE_1_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
+          BUILDER_NODE_1_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
+          BUILDER_NODE_2_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
+          BUILDER_NODE_2_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
+          BUILDER_NODE_2_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
       - name: Get base registry
         run: |
           echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV

From 4f64cf128f208e660d0193b12e2d40142147090e Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 3 Dec 2024 17:10:54 -0600
Subject: [PATCH 59/94] ci(torch): Update CUDA 12.6 builds to 12.6.3; update
 `torch:nccl` bases

---
 .github/configurations/torch-base.yml | 2 +-
 .github/configurations/torch-nccl.yml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml
index cb727713..0ff20e33 100644
--- a/.github/configurations/torch-base.yml
+++ b/.github/configurations/torch-base.yml
@@ -1,4 +1,4 @@
-cuda: [ 12.6.2, 12.4.1 ]
+cuda: [ 12.6.3, 12.4.1 ]
 os: [ ubuntu22.04, ubuntu20.04 ]
 include:
   - torch: 2.5.1
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index 94d85d47..bfa98fe2 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -1,8 +1,8 @@
-cuda: [ 12.6.2, 12.4.1 ]
+cuda: [ 12.6.3, 12.4.1 ]
 os: [ ubuntu22.04, ubuntu20.04 ]
 include:
   - torch: 2.5.1
     vision: 0.20.0
     audio: 2.5.0
     nccl: 2.23.4-1
-    nccl-tests-hash: 3ef8839
+    nccl-tests-hash: 007a325

From 0dcf27deab369d1778b24399e6f792d237594b2a Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 3 Dec 2024 17:15:49 -0600
Subject: [PATCH 60/94] fix(torch): Edit `flash-attn` 3 installation for
 compatibility with TE

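TransformerEngine expects flash-attn 3 to be importable via:

    import flashattn_hopper.flash_attn_interface

The flashattn-hopper distribution is installed without that package
path, so supporting it properly requires some special shenanigans
after installation.
This change creates a `flashattn_hopper/` directory containing a symlink
to the original `flash_attn_interface.py` file, and appends the symlink
and its compiled bytecode to the distribution's RECORD, registering them
as belonging to the flashattn-hopper distribution so that they can be
uninstalled correctly if needed.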
---
 torch/Dockerfile | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index be5464e7..cb0f20aa 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -627,3 +627,51 @@ WORKDIR /usr/src/app
 RUN --mount=type=bind,from=builder,source=/build/dist,target=. \
     pip3 install --no-cache-dir -U numpy packaging && \
     pip3 install --no-cache-dir -U ./*.whl
+
+# Make a symlink to flash-attn v3 where TransformerEngine expects it,
+# and modify the installation record so that pip uninstall knows how to
+# fully remove it.
+RUN <<-"EOT"
+	#!/bin/env python3
+	from base64 import urlsafe_b64encode as b64
+	from hashlib import sha256
+	from importlib import metadata
+	from pathlib import Path
+	from py_compile import compile
+
+	dist = metadata.distribution("flashattn-hopper")
+	p = dist.locate_file("flash_attn_interface.py")
+	print("flash_attn_interface:", p)
+	root = p.parent
+
+	if not p.exists():
+	    raise SystemExit("flash_attn_interface not found")
+	if not p.is_file():
+	    raise SystemExit("flash_attn_interface path is not a file")
+
+	d = root / "flashattn_hopper"
+	if d.exists():
+	    raise SystemExit(f'"{d}" already exists')
+
+	d.mkdir(mode=0o755, parents=False, exist_ok=False)
+	new = d / p.name
+	new.symlink_to(p)
+	print(f"Created new symlink at {new}")
+
+	compiled = Path(compile(new))
+
+
+	def record_entry(path: Path) -> str:
+	    content = path.read_bytes()
+	    digest = b64(sha256(content).digest()).rstrip(b"=").decode()
+	    package_path = path.relative_to(root).as_posix()
+	    return f"{package_path},sha256={digest},{len(content):d}\r\n"
+
+
+	for f in dist.files:
+	    if f.match("flashattn?hopper-*.dist-info/RECORD"):
+	        with f.locate().open("a", encoding="utf-8", newline="") as record:
+	            for added in (new, compiled):
+	                record.write(record_entry(added))
+	        break
+EOT

From fd6df40bb43433905de96b47db1c119eef21240f Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 16 Dec 2024 14:10:57 -0600
Subject: [PATCH 61/94] fix(torch): Add redundant interpreter specification for
 compatibility

---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index cb0f20aa..4ebf58b6 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -631,7 +631,7 @@ RUN --mount=type=bind,from=builder,source=/build/dist,target=. \
 # Make a symlink to flash-attn v3 where TransformerEngine expects it,
 # and modify the installation record so that pip uninstall knows how to
 # fully remove it.
-RUN <<-"EOT"
+RUN <<-"EOT" python3
 	#!/bin/env python3
 	from base64 import urlsafe_b64encode as b64
 	from hashlib import sha256

From ebaf5aa31ae6d09d8cc28528e9178aa968080273 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 16 Dec 2024 15:11:14 -0600
Subject: [PATCH 62/94] feat(torch): Update LLVM components, including `libomp`
 runtime library

---
 torch-extras/Dockerfile |  9 +++------
 torch/Dockerfile        | 20 +++++++++++++++-----
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 0b967a0b..30bdc784 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -72,18 +72,15 @@ RUN apt-get -qq update && apt-get -qq install -y \
 
 # Update compiler (GCC) and linker (LLD) versions
 # gfortran-11 is just for compiler_wrapper.f95
-RUN CODENAME="$(lsb_release -cs)" && \
-    wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
-    apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \
-    apt-add-repository -y ppa:ubuntu-toolchain-r/test && \
+RUN LLVM_VERSION='18' && \
     apt-get -qq update && apt-get -qq install --no-install-recommends -y \
-      gcc-11 g++-11 gfortran-11 lld-17 && \
+      gcc-11 g++-11 gfortran-11 "lld-$LLVM_VERSION" && \
     apt-get clean && \
     update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \
     update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
     update-alternatives --install \
       /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \
-    update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1
+    update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1
 
 RUN mkdir /wheels /build
 WORKDIR /build
diff --git a/torch/Dockerfile b/torch/Dockerfile
index 4ebf58b6..7a9e6e50 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -197,13 +197,16 @@ ENV CCACHE_DIR=/ccache \
     CMAKE_CUDA_COMPILER_LAUNCHER=ccache
 
 # Update compiler (GCC) and linker (LLD) versions
-RUN CODENAME="$(lsb_release -cs)" && \
+RUN LLVM_VERSION='18' && \
+    CODENAME="$(lsb_release -cs)" && \
     wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
-    apt-add-repository -n "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \
+    apt-add-repository -n "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \
     SETUP_TOOLCHAIN() { \
         apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \
         | sed -e '/connection timed out/{p; Q1}' && \
-        apt-get -qq install --no-install-recommends -y gcc-11 g++-11 gfortran-11 lld-17 && \
+        apt-get -qq install --no-install-recommends -y \
+          gcc-11 g++-11 gfortran-11 \
+          "lld-$LLVM_VERSION" "libomp-$LLVM_VERSION-dev" && \
         apt-get clean; \
     } && \
     { SETUP_TOOLCHAIN || { sleep "$(shuf -i10-20 -n1)" && SETUP_TOOLCHAIN; }; } && \
@@ -211,7 +214,7 @@ RUN CODENAME="$(lsb_release -cs)" && \
     update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
     update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \
     if [ "$(uname -m)" != 'aarch64' ]; then \
-      update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1; \
+      update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1; \
     fi
 
 # Install AOCL-BLAS and AOCL-LAPACK
@@ -563,7 +566,7 @@ RUN apt-get -qq update && apt-get -qq install -y \
     ldconfig
 
 RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \
-        software-properties-common && \
+        software-properties-common lsb-release && \
     SETUP_LIBSTDCXX() { \
         apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \
         | sed -e '/connection timed out/{p; Q1}' && \
@@ -572,6 +575,13 @@ RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \
     } && \
     { SETUP_LIBSTDCXX || { sleep "$(shuf -i10-20 -n1)" && SETUP_LIBSTDCXX; }; }
 
+RUN LLVM_VERSION='18' && \
+    CODENAME="$(lsb_release -cs)" && \
+    wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
+    apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \
+    apt-get -qq install -y --no-install-recommends "libomp5-$LLVM_VERSION" && \
+    apt-get clean
+
 # Install AOCL-BLAS and AOCL-LAPACK
 # See: https://www.amd.com/en/developer/aocl/dense.html
 ARG AOCL_BASE

From ed93c44190e043c8ee9700b97bf0ebdef3d0d207 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 16 Dec 2024 15:32:10 -0600
Subject: [PATCH 63/94] ci: Remove deprecated BuildKit runner endpoint

[skip ci]
---
 .github/workflows/build.yml | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index fdb4cc41..2c825b36 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -56,10 +56,9 @@ jobs:
         uses: docker/setup-buildx-action@v3.7.1
         with:
           driver: remote
-          endpoint: ${{ secrets.BUILDKIT_CONSUMER_ENDPOINT }}
+          endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }}
+          platforms: linux/amd64
           append: |
-            - endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }}
-              platforms: linux/amd64
             - endpoint: ${{ secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }}
               platforms: linux/arm64
         env:
@@ -69,9 +68,6 @@ jobs:
           BUILDER_NODE_1_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
           BUILDER_NODE_1_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
           BUILDER_NODE_1_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
-          BUILDER_NODE_2_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
-          BUILDER_NODE_2_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
-          BUILDER_NODE_2_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
       - name: Get base registry
         run: |
           echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV

From 2da2fb144e5aad8b613f1d62b181963ef591a8e3 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 24 Dec 2024 12:30:22 -0600
Subject: [PATCH 64/94] feat(torch): Upgrade TransformerEngine to v1.13

---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 7a9e6e50..4ee75b02 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -5,7 +5,7 @@ ARG FINAL_BASE_IMAGE="nvidia/cuda:12.4.1-base-ubuntu22.04"
 ARG BUILD_TORCH_VERSION="2.5.1"
 ARG BUILD_TORCH_VISION_VERSION="0.20.0"
 ARG BUILD_TORCH_AUDIO_VERSION="2.5.0"
-ARG BUILD_TRANSFORMERENGINE_VERSION="1.12"
+ARG BUILD_TRANSFORMERENGINE_VERSION="1.13"
 ARG BUILD_FLASH_ATTN_VERSION="2.6.3"
 ARG BUILD_FLASH_ATTN_3_VERSION="2.7.0.post2"
 ARG BUILD_TRITON_VERSION=""

From 09ed20035d253a1f79cd97320d570f38d26a6d73 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Tue, 24 Dec 2024 12:31:02 -0600
Subject: [PATCH 65/94] fix(torch): Add `-ffree-line-length-512` to `gfortran`
 invocations

[skip ci]
---
 torch-extras/Dockerfile | 2 +-
 torch/Dockerfile        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 30bdc784..5a6c66f4 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -100,7 +100,7 @@ RUN if [ "$(uname -m)" = "aarch64" ]; then \
       NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \
       AVX='WRAPPER_AVX="AVX256"'; \
     fi && \
-    gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
+    gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
 
 COPY --chmod=755 effective_cpu_count.sh .
 COPY --chmod=755 scale.sh .
diff --git a/torch/Dockerfile b/torch/Dockerfile
index 4ee75b02..a9360dd0 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -262,7 +262,7 @@ RUN if [ "$(uname -m)" = "aarch64" ]; then \
       NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \
       AVX='WRAPPER_AVX="AVX256"'; \
     fi && \
-    gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
+    gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
 
 COPY <<-"EOT" /build/version-string.sh
 	#!/bin/sh

From 8f95f595b3c9e75b279811042fb12a5b338bb388 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Thu, 9 Jan 2025 17:59:18 -0600
Subject: [PATCH 66/94] feat(torch): Update `flash-attention` 2 & 3 to v2.7.2

---
 torch/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index a9360dd0..1aacffbc 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -6,8 +6,8 @@ ARG BUILD_TORCH_VERSION="2.5.1"
 ARG BUILD_TORCH_VISION_VERSION="0.20.0"
 ARG BUILD_TORCH_AUDIO_VERSION="2.5.0"
 ARG BUILD_TRANSFORMERENGINE_VERSION="1.13"
-ARG BUILD_FLASH_ATTN_VERSION="2.6.3"
-ARG BUILD_FLASH_ATTN_3_VERSION="2.7.0.post2"
+ARG BUILD_FLASH_ATTN_VERSION="2.7.2.post1"
+ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1"
 ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TRITON="1"
 ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX"

From 8e290756f90c6ca5ac5c8e83f02b1c5ed63a45a3 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Thu, 9 Jan 2025 21:58:38 -0600
Subject: [PATCH 67/94] ci(torch): Drop Ubuntu 20.04 CI builds

---
 .github/configurations/torch-base.yml | 2 +-
 .github/configurations/torch-nccl.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml
index 0ff20e33..d0dcc24f 100644
--- a/.github/configurations/torch-base.yml
+++ b/.github/configurations/torch-base.yml
@@ -1,5 +1,5 @@
 cuda: [ 12.6.3, 12.4.1 ]
-os: [ ubuntu22.04, ubuntu20.04 ]
+os: [ ubuntu22.04 ]
 include:
   - torch: 2.5.1
     vision: 0.20.0
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index bfa98fe2..21c3ed57 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -1,5 +1,5 @@
 cuda: [ 12.6.3, 12.4.1 ]
-os: [ ubuntu22.04, ubuntu20.04 ]
+os: [ ubuntu22.04 ]
 include:
   - torch: 2.5.1
     vision: 0.20.0

From 459aa230741312515b82f7e36b89a5ec6db424d6 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 29 Jan 2025 01:27:56 -0600
Subject: [PATCH 68/94] ci: Re-enable multi-arch builds

[skip ci]
---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 2c825b36..6eb3b114 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -109,7 +109,7 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
           cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
-          platforms: linux/amd64
+          platforms: linux/amd64,linux/arm64
       - name: Clear registry credentials
         if: always()
         run: |

From cd1019bd7e3d131755e3c42904ebebb5d3e7d2e6 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 29 Jan 2025 01:28:38 -0600
Subject: [PATCH 69/94] build(torch): Add new build targets with CUDA 12.8.0

---
 .github/configurations/torch-base.yml |  2 +-
 .github/configurations/torch-nccl.yml |  6 +--
 torch/Dockerfile                      | 56 ++++++++++++++++-----------
 3 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml
index d0dcc24f..18d5caa7 100644
--- a/.github/configurations/torch-base.yml
+++ b/.github/configurations/torch-base.yml
@@ -1,4 +1,4 @@
-cuda: [ 12.6.3, 12.4.1 ]
+cuda: [ 12.8.0, 12.6.3, 12.4.1 ]
 os: [ ubuntu22.04 ]
 include:
   - torch: 2.5.1
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index 21c3ed57..0c84b40f 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -1,8 +1,8 @@
-cuda: [ 12.6.3, 12.4.1 ]
+cuda: [ 12.8.0, 12.6.3, 12.4.1 ]
 os: [ ubuntu22.04 ]
 include:
   - torch: 2.5.1
     vision: 0.20.0
     audio: 2.5.0
-    nccl: 2.23.4-1
-    nccl-tests-hash: 007a325
+    nccl: 2.25.1-1
+    nccl-tests-hash: 4e02d6a
diff --git a/torch/Dockerfile b/torch/Dockerfile
index 1aacffbc..1994ed7a 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -1,6 +1,6 @@
 # syntax=docker/dockerfile:1.4
-ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.4.1-devel-ubuntu22.04"
-ARG FINAL_BASE_IMAGE="nvidia/cuda:12.4.1-base-ubuntu22.04"
+ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.8.0-devel-ubuntu22.04"
+ARG FINAL_BASE_IMAGE="nvidia/cuda:12.8.0-base-ubuntu22.04"
 
 ARG BUILD_TORCH_VERSION="2.5.1"
 ARG BUILD_TORCH_VISION_VERSION="0.20.0"
@@ -21,7 +21,7 @@ ARG AOCL_URL="https://download.amd.com/developer/eula/aocl/aocl-4-2/aocl-linux-a
 
 # Clone PyTorch repositories independently from all other build steps
 # for cache-friendliness and parallelization
-FROM alpine/git:2.40.1 as downloader-base
+FROM alpine/git:2.40.1 AS downloader-base
 WORKDIR /git
 RUN git config --global advice.detachedHead false
 
@@ -46,7 +46,7 @@ EOT
 RUN chmod 755 /git/clone.sh
 
 
-FROM downloader-base as pytorch-downloader
+FROM downloader-base AS pytorch-downloader
 ARG BUILD_TORCH_VERSION
 # Includes a patch for a foreach bug in PyTorch v2.5.1
 RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \
@@ -56,12 +56,12 @@ RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \
     fi && \
     rm -rf pytorch/.git
 
-FROM downloader-base as torchvision-downloader
+FROM downloader-base AS torchvision-downloader
 ARG BUILD_TORCH_VISION_VERSION
 RUN ./clone.sh pytorch/vision vision "${BUILD_TORCH_VISION_VERSION}" && \
     rm -rf vision/.git
 
-FROM downloader-base as torchaudio-downloader
+FROM downloader-base AS torchaudio-downloader
 ARG BUILD_TORCH_AUDIO_VERSION
 RUN ./clone.sh pytorch/audio audio "${BUILD_TORCH_AUDIO_VERSION}"
 # The torchaudio build requires that this directory remain a full git repository,
@@ -77,15 +77,15 @@ RUN if grep -qF '#include <float.h>' \
     fi && \
     rm /git/patch
 
-FROM downloader-base as transformerengine-downloader
+FROM downloader-base AS transformerengine-downloader
 ARG BUILD_TRANSFORMERENGINE_VERSION
 RUN ./clone.sh NVIDIA/TransformerEngine TransformerEngine "${BUILD_TRANSFORMERENGINE_VERSION}"
 
-FROM downloader-base as flash-attn-downloader
+FROM downloader-base AS flash-attn-downloader
 ARG BUILD_FLASH_ATTN_VERSION
 RUN ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_VERSION}"
 
-FROM downloader-base as flash-attn-3-downloader
+FROM downloader-base AS flash-attn-3-downloader
 ARG BUILD_FLASH_ATTN_3_VERSION
 RUN if [ -n "$BUILD_FLASH_ATTN_3_VERSION" ]; then \
       ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_3_VERSION}"; \
@@ -93,7 +93,7 @@ RUN if [ -n "$BUILD_FLASH_ATTN_3_VERSION" ]; then \
       mkdir flash-attention; \
     fi
 
-FROM downloader-base as triton-version
+FROM downloader-base AS triton-version
 ENV TRITON_COMMIT_FILE='.ci/docker/ci_commit_pins/triton.txt'
 COPY --link --from=pytorch-downloader "/git/pytorch/${TRITON_COMMIT_FILE}" /git/version.txt
 ARG BUILD_TRITON_VERSION
@@ -101,7 +101,7 @@ RUN if [ -n "${BUILD_TRITON_VERSION}" ]; then \
       echo "${BUILD_TRITON_VERSION}" > /git/version.txt; \
     fi
 
-FROM downloader-base as triton-downloader
+FROM downloader-base AS triton-downloader
 COPY --link --from=triton-version /git/version.txt /git/version.txt
 ARG BUILD_TRITON
 RUN if [ "${BUILD_TRITON}" = '1' ]; then \
@@ -110,7 +110,7 @@ RUN if [ "${BUILD_TRITON}" = '1' ]; then \
       mkdir triton; \
     fi
 
-FROM alpine/curl:8.7.1 as aocl-downloader
+FROM alpine/curl:8.7.1 AS aocl-downloader
 WORKDIR /tmp/install
 
 RUN apk add --no-cache bash
@@ -136,7 +136,7 @@ RUN curl -sSfo- "${AOCL_URL}" | tar xzf - --strip-components 1 && \
 
 
 ## Build PyTorch on a builder image.
-FROM ${BUILDER_BASE_IMAGE} as builder-base
+FROM ${BUILDER_BASE_IMAGE} AS builder-base-shared
 ENV DEBIAN_FRONTEND=noninteractive
 
 ARG BUILD_CCACHE_SIZE="1Gi"
@@ -215,8 +215,16 @@ RUN LLVM_VERSION='18' && \
     update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \
     if [ "$(uname -m)" != 'aarch64' ]; then \
       update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1; \
-    fi
+    fi && \
+    ldconfig
+
 
+FROM builder-base-shared AS builder-base-arm64
+# There is currently no CPU BLAS used for ARM builds,
+# so this stage is just an alias
+
+
+FROM builder-base-shared AS builder-base-amd64
 # Install AOCL-BLAS and AOCL-LAPACK
 # See: https://www.amd.com/en/developer/aocl/dense.html
 ARG AOCL_BASE
@@ -248,6 +256,8 @@ ENV C_INCLUDE_PATH="${AOCL_BASE}/include${C_INCLUDE_PATH:+:$C_INCLUDE_PATH}" \
     LD_LIBRARY_PATH="${AOCL_BASE}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
     LIBRARY_PATH="${AOCL_BASE}/lib${LIBRARY_PATH:+:$LIBRARY_PATH}"
 
+
+FROM builder-base-${TARGETARCH} AS builder-base
 RUN mkdir /build /build/dist
 WORKDIR /build
 COPY --chmod=755 effective_cpu_count.sh .
@@ -322,8 +332,8 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST
 ENV TORCH_VERSION=$BUILD_TORCH_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
 
-ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a"
-ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}"
+ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a -gencode=arch=compute_100,code=[sm_100,compute_100]"
+ENV NVCC_APPEND_FLAGS="-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}"
 
 # If the directory /opt/nccl-tests exists,
 # the base image is assumed to be nccl-tests,
@@ -394,7 +404,7 @@ RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl
 RUN python3 -m pip install -U --no-cache-dir \
       packaging setuptools wheel pip
 
-FROM builder-base as torchvision-builder
+FROM builder-base AS torchvision-builder
 RUN rm ./dist/*
 
 ## Build torchvision
@@ -432,7 +442,7 @@ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=visi
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     python3 setup.py bdist_wheel --dist-dir ../dist
 
-FROM builder-base as torchaudio-builder
+FROM builder-base AS torchaudio-builder
 RUN rm ./dist/*
 
 ## Build torchaudio
@@ -470,7 +480,7 @@ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     python3 setup.py bdist_wheel --dist-dir ../dist
 
-FROM builder-base as transformerengine-builder
+FROM builder-base AS transformerengine-builder
 RUN rm ./dist/*
 
 # Build TransformerEngine
@@ -489,7 +499,7 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE
     fi && \
     python3 setup.py bdist_wheel --dist-dir /build/dist
 
-FROM builder-base as flash-attn-builder-base
+FROM builder-base AS flash-attn-builder-base
 RUN rm ./dist/*
 ENV PYTHONUNBUFFERED=1
 ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
@@ -504,7 +514,7 @@ COPY <<-"EOT" /build/fa-build.sh
 EOT
 RUN chmod 755 /build/fa-build.sh
 
-FROM flash-attn-builder-base as flash-attn-builder
+FROM flash-attn-builder-base AS flash-attn-builder
 
 # Build flash-attn
 RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \
@@ -523,7 +533,7 @@ RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar
       csrc/xentropy); \
     do /build/fa-build.sh "$EXT_DIR" || exit 1; done
 
-FROM flash-attn-builder-base as flash-attn-3-builder
+FROM flash-attn-builder-base AS flash-attn-3-builder
 
 # Artifically sequence this build stage after the previous one
 # to prevent parallelism, because these are both very resource-intensive
@@ -540,7 +550,7 @@ RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,t
     echo "MAX_JOBS: ${MAX_JOBS}" && \
     /build/fa-build.sh flash-attention/hopper
 
-FROM builder-base as builder
+FROM builder-base AS builder
 COPY --link --from=torchaudio-builder /build/dist/ /build/dist/
 COPY --link --from=torchvision-builder /build/dist/ /build/dist/
 COPY --link --from=transformerengine-builder /build/dist/ /build/dist/

From 643b362d7b62b1d565494004585ea6d17b3904c8 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 29 Jan 2025 01:36:41 -0600
Subject: [PATCH 70/94] ci(torch): Update `nccl-tests` commit hash

---
 .github/configurations/torch-nccl.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index 0c84b40f..71c96ce3 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -5,4 +5,4 @@ include:
     vision: 0.20.0
     audio: 2.5.0
     nccl: 2.25.1-1
-    nccl-tests-hash: 4e02d6a
+    nccl-tests-hash: 57fa979

From b58974c2a02fd66212566821bcd01a9197eee1d7 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 29 Jan 2025 02:06:18 -0600
Subject: [PATCH 71/94] build(torch): Filter `compute_100` build on older CUDA
 versions

---
 torch/Dockerfile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 1994ed7a..a26b655a 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -333,6 +333,10 @@ ENV TORCH_VERSION=$BUILD_TORCH_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
 
 ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a -gencode=arch=compute_100,code=[sm_100,compute_100]"
+# Remove compute_100 build if NV_CUDA_LIB_VERSION doesn't match 12.[89].*
+ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS}:::${NV_CUDA_LIB_VERSION}"
+ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS%:::12.[89].*}"
+ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS% -gencode=arch=compute_100,code=\[sm_100,compute_100\]:::*}"
 ENV NVCC_APPEND_FLAGS="-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}"
 
 # If the directory /opt/nccl-tests exists,
@@ -366,6 +370,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     --mount=type=cache,target=/ccache \
     export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
     ./storage-info.sh . && \
     cd pytorch && \
     ../storage-info.sh . && \

From 7095c593814106093906dc6a50e30a8a2a9a6988 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 29 Jan 2025 02:26:51 -0600
Subject: [PATCH 72/94] build(torch): Switch `NVCC_APPEND_FLAGS` to not be an
 `ENV` directive

---
 torch-extras/Dockerfile | 16 ++++++++++++----
 torch/Dockerfile        | 24 ++++++++++++++++++------
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 5a6c66f4..415608b4 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -106,7 +106,11 @@ COPY --chmod=755 effective_cpu_count.sh .
 COPY --chmod=755 scale.sh .
 
 ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a"
-ENV NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS:+$BUILD_NVCC_APPEND_FLAGS }-diag-suppress 186,177"
+RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \
+    case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \
+      FLAGS="${FLAGS} -gencode=arch=compute_100,code=[sm_100,compute_100]" ;; \
+    esac && \
+    echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf
 ARG BUILD_MAX_JOBS
 
 
@@ -114,6 +118,7 @@ FROM builder-base as deepspeed-builder
 
 ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST
 RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=ds-kernels/,rw \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
     cd ds-kernels && \
     export CUDA_ARCH_LIST="${DEEPSPEED_KERNELS_CUDA_ARCH_LIST}" && \
     echo "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" && \
@@ -146,8 +151,9 @@ ARG DS_BUILD_AIO=""
 ARG DEEPSPEED_VERSION
 
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
-RUN python3 -m pip install -U --no-cache-dir \
-      setuptools wheel pip deepspeed-kernels py-cpuinfo && \
+RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    python3 -m pip install -U --no-cache-dir \
+      setuptools wheel pip py-cpuinfo && \
     if python3 -m pip show torch | grep 'Version: 2\.[1-9]' > /dev/null; then \
       # DeepSpeed's AIO extension is incompatible with PyTorch 2.1.x's
       # requirement for C++17 (as of DeepSpeed 0.10.1).
@@ -198,6 +204,7 @@ RUN LIBNCCL2_VERSION=$(dpkg-query --showformat='${Version}' --show libnccl2) &&
 # --distributed_adam, --distributed_lamb, and --group_norm aren't documented
 # in the Apex README, but are defined in its setup.py config.
 RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
     python3 -m pip install -U --no-cache-dir \
       packaging setuptools wheel pip && \
     CUDA_MAJOR_VERSION=$(echo "${CUDA_VERSION}" | cut -d. -f1) && \
@@ -251,7 +258,8 @@ FROM builder-base as xformers-builder
 ARG XFORMERS_VERSION
 
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
-RUN python3 -m pip install -U --no-cache-dir \
+RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    python3 -m pip install -U --no-cache-dir \
       setuptools wheel pip && \
     CC=$(realpath -e ./compiler) \
       MAX_JOBS=1 \
diff --git a/torch/Dockerfile b/torch/Dockerfile
index a26b655a..795e1172 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -332,12 +332,13 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST
 ENV TORCH_VERSION=$BUILD_TORCH_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
 
-ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a -gencode=arch=compute_100,code=[sm_100,compute_100]"
-# Remove compute_100 build if NV_CUDA_LIB_VERSION doesn't match 12.[89].*
-ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS}:::${NV_CUDA_LIB_VERSION}"
-ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS%:::12.[89].*}"
-ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS% -gencode=arch=compute_100,code=\[sm_100,compute_100\]:::*}"
-ENV NVCC_APPEND_FLAGS="-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}"
+ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a"
+# Add compute_100 build if NV_CUDA_LIB_VERSION matches 12.[89].*
+RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \
+    case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \
+      FLAGS="${FLAGS} -gencode=arch=compute_100,code=[sm_100,compute_100]" ;; \
+    esac && \
+    echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf
 
 # If the directory /opt/nccl-tests exists,
 # the base image is assumed to be nccl-tests,
@@ -370,6 +371,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     --mount=type=cache,target=/ccache \
     export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
     echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
     ./storage-info.sh . && \
     cd pytorch && \
@@ -422,6 +424,8 @@ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=visi
     --mount=type=cache,target=/ccache \
     export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
     cd vision && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
@@ -460,6 +464,8 @@ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/
     --mount=type=cache,target=/ccache \
     export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
     cd audio && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
@@ -497,6 +503,8 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE
     export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \
     export MAX_JOBS="${BUILD_MAX_JOBS:-$MAX_JOBS}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
     cd TransformerEngine && \
     if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \
       sed -i "s/from functools import cache/from functools import lru_cache as cache/g" \
@@ -527,6 +535,8 @@ RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar
     export CC=$(realpath -e ./compiler) \
       MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
     cd flash-attention && \
     for EXT_DIR in $(realpath -s -e \
       . \
@@ -553,6 +563,8 @@ RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,t
     export CC=$(realpath -e ./compiler) \
       MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 10 6)}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
     /build/fa-build.sh flash-attention/hopper
 
 FROM builder-base AS builder

From 440d8441146fdfb22a9ea3237ba684a978b70573 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 29 Jan 2025 19:10:55 -0600
Subject: [PATCH 73/94] ci: Build only for `linux/amd64` again

[skip ci]
---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6eb3b114..2c825b36 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -109,7 +109,7 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
           cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
-          platforms: linux/amd64,linux/arm64
+          platforms: linux/amd64
       - name: Clear registry credentials
         if: always()
         run: |

From 6bc6fb62a5dce9a7825131e4fd7f22deaab42604 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 29 Jan 2025 19:13:30 -0600
Subject: [PATCH 74/94] feat(torch): Build with PyTorch v2.6.0

---
 .github/configurations/torch-base.yml | 6 +++---
 .github/configurations/torch-nccl.yml | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml
index 18d5caa7..1d7cebe2 100644
--- a/.github/configurations/torch-base.yml
+++ b/.github/configurations/torch-base.yml
@@ -1,6 +1,6 @@
 cuda: [ 12.8.0, 12.6.3, 12.4.1 ]
 os: [ ubuntu22.04 ]
 include:
-  - torch: 2.5.1
-    vision: 0.20.0
-    audio: 2.5.0
+  - torch: 2.6.0
+    vision: 0.21.0
+    audio: 2.6.0
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index 71c96ce3..826ddf93 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -1,8 +1,8 @@
 cuda: [ 12.8.0, 12.6.3, 12.4.1 ]
 os: [ ubuntu22.04 ]
 include:
-  - torch: 2.5.1
-    vision: 0.20.0
-    audio: 2.5.0
+  - torch: 2.6.0
+    vision: 0.21.0
+    audio: 2.6.0
     nccl: 2.25.1-1
     nccl-tests-hash: 57fa979

From ba41ff96cc1648369e0a34a2bd6795749ccb417c Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 29 Jan 2025 19:41:33 -0600
Subject: [PATCH 75/94] feat(torch): Build with `flash-attn` v2.7.4.post1

---
 torch/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 795e1172..441d9129 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -6,8 +6,8 @@ ARG BUILD_TORCH_VERSION="2.5.1"
 ARG BUILD_TORCH_VISION_VERSION="0.20.0"
 ARG BUILD_TORCH_AUDIO_VERSION="2.5.0"
 ARG BUILD_TRANSFORMERENGINE_VERSION="1.13"
-ARG BUILD_FLASH_ATTN_VERSION="2.7.2.post1"
-ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1"
+ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1"
+ARG BUILD_FLASH_ATTN_3_VERSION="2.7.4.post1"
 ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TRITON="1"
 ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX"

From 25e8a9ed55da858e1b9526fd3c8034ad8665ea27 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 29 Jan 2025 19:45:08 -0600
Subject: [PATCH 76/94] build(torch): Build both CXX11 ABI variants

---
 .github/configurations/torch-base.yml | 1 +
 .github/configurations/torch-nccl.yml | 1 +
 .github/workflows/torch-base.yml      | 3 ++-
 .github/workflows/torch-nccl.yml      | 3 ++-
 .github/workflows/torch.yml           | 8 ++++++++
 torch/Dockerfile                      | 4 ++++
 6 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml
index 1d7cebe2..f75b79a5 100644
--- a/.github/configurations/torch-base.yml
+++ b/.github/configurations/torch-base.yml
@@ -1,5 +1,6 @@
 cuda: [ 12.8.0, 12.6.3, 12.4.1 ]
 os: [ ubuntu22.04 ]
+abi: [ 1, 0 ]
 include:
   - torch: 2.6.0
     vision: 0.21.0
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index 826ddf93..ec1f1f91 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -1,5 +1,6 @@
 cuda: [ 12.8.0, 12.6.3, 12.4.1 ]
 os: [ ubuntu22.04 ]
+abi: [ 1, 0 ]
 include:
   - torch: 2.6.0
     vision: 0.21.0
diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml
index 93148a65..a6fc2fb1 100644
--- a/.github/workflows/torch-base.yml
+++ b/.github/workflows/torch-base.yml
@@ -35,11 +35,12 @@ jobs:
     secrets: inherit
     with:
       image-name: ${{ inputs.image-name }}
-      tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
+      tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }}
       builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-${{ matrix.os }}
       base-image: nvidia/cuda:${{ matrix.cuda }}-base-${{ matrix.os }}
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
+      cxx11-abi: ${{ matrix.abi }}
       cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml
index 9815639d..feae8f7b 100644
--- a/.github/workflows/torch-nccl.yml
+++ b/.github/workflows/torch-nccl.yml
@@ -43,11 +43,12 @@ jobs:
     secrets: inherit
     with:
       image-name: ${{ inputs.image-name }}
-      tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
+      tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }}
       builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
       base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
+      cxx11-abi: ${{ matrix.abi }}
       cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml
index 36bdcc6d..b0eb4634 100644
--- a/.github/workflows/torch.yml
+++ b/.github/workflows/torch.yml
@@ -22,6 +22,9 @@ on:
       triton-version:
         required: false
         type: string
+      cxx11-abi:
+        required: false
+        type: string
       cuda-arch-support:
         required: false
         type: string
@@ -67,6 +70,10 @@ on:
         required: false
         description: "Tagged version number from openai/triton to build"
         type: string
+      cxx11-abi:
+        required: false
+        description: "Build with the CXX11 ABI (1 = enable, 0 = disable)"
+        type: string
       cuda-arch-support:
         required: false
         description: "Space-separated list of CUDA architectures to support"
@@ -99,6 +106,7 @@ jobs:
         BUILD_TORCH_VERSION=${{ inputs.torch-version }}
         BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }}
         BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }}
+        ${{ inputs.cxx11-abi && format('BUILD_CXX11_ABI={0}', inputs.cxx11-abi) || '' }}
         ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
         ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }}
   build-extras:
diff --git a/torch/Dockerfile b/torch/Dockerfile
index 441d9129..5ca9715d 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -366,6 +366,7 @@ RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \
 # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271).
 # Without WITH_BLAS, it would detect the BLAS implementation as
 # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either.
+ARG BUILD_CXX11_ABI=""
 SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
 RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \
     --mount=type=cache,target=/ccache \
@@ -373,6 +374,9 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     echo "MAX_JOBS: ${MAX_JOBS}" && \
     export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
     echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
+    if [ -n "${BUILD_CXX11_ABI}" ]; then \
+      export _GLIBCXX_USE_CXX11_ABI="${BUILD_CXX11_ABI}"; \
+    fi && \
     ./storage-info.sh . && \
     cd pytorch && \
     ../storage-info.sh . && \

From 77574c2f694148d48ef49cca069d2835364f82ed Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Wed, 29 Jan 2025 19:49:34 -0600
Subject: [PATCH 77/94] ci(torch): Remove parameterization of
 `TORCH_CUDA_ARCH_LIST`

This is to fit within the 10-parameter limit for reusable workflows.
---
 .github/workflows/torch.yml | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml
index b0eb4634..6538dff6 100644
--- a/.github/workflows/torch.yml
+++ b/.github/workflows/torch.yml
@@ -25,10 +25,6 @@ on:
       cxx11-abi:
         required: false
         type: string
-      cuda-arch-support:
-        required: false
-        type: string
-        default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
       image-name:
         required: false
         type: string
@@ -74,11 +70,6 @@ on:
         required: false
         description: "Build with the CXX11 ABI (1 = enable, 0 = disable)"
         type: string
-      cuda-arch-support:
-        required: false
-        description: "Space-separated list of CUDA architectures to support"
-        type: string
-        default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
       image-name:
         required: false
         description: "Custom name under which to publish the resulting container"
@@ -106,8 +97,8 @@ jobs:
         BUILD_TORCH_VERSION=${{ inputs.torch-version }}
         BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }}
         BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }}
+        BUILD_TORCH_CUDA_ARCH_LIST=7.0 7.5 8.0 8.6 8.9 9.0+PTX
         ${{ inputs.cxx11-abi && format('BUILD_CXX11_ABI={0}', inputs.cxx11-abi) || '' }}
-        ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
         ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }}
   build-extras:
     name: Build torch-extras

From 486738eb35217f03e08faaa320dec92b8e98d379 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Thu, 30 Jan 2025 11:19:26 -0600
Subject: [PATCH 78/94] build(torch): Downgrade `flash-attn` 3 to the
 2.7.2.post1 tag

2.7.4.post1 has build conflicts with flash-attn 2 and with compute_100
---
 torch/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 5ca9715d..4f5862d0 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -7,7 +7,7 @@ ARG BUILD_TORCH_VISION_VERSION="0.20.0"
 ARG BUILD_TORCH_AUDIO_VERSION="2.5.0"
 ARG BUILD_TRANSFORMERENGINE_VERSION="1.13"
 ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1"
-ARG BUILD_FLASH_ATTN_3_VERSION="2.7.4.post1"
+ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1"
 ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TRITON="1"
 ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX"

From 2da31d97755e049ad4c4bee31a7ce30b1c0cd4dc Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Sat, 1 Feb 2025 12:18:22 -0600
Subject: [PATCH 79/94] ci: Re-enable ARM64 builds again

---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 2c825b36..603b0b97 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -34,7 +34,7 @@ jobs:
   build:
     name: Build Images
     runs-on: [ cw ]
-    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.6.0'
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0'
     defaults:
       run:
         shell: bash
@@ -109,7 +109,7 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
           cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
-          platforms: linux/amd64
+          platforms: linux/amd64,linux/arm64
       - name: Clear registry credentials
         if: always()
         run: |

From ac7f89d73d623500396ab30458fa09c598d02f60 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Sat, 1 Feb 2025 12:19:55 -0600
Subject: [PATCH 80/94] ci(torch): Increase `torch` image build job timeout

---
 .github/workflows/torch.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml
index 6538dff6..4b27af68 100644
--- a/.github/workflows/torch.yml
+++ b/.github/workflows/torch.yml
@@ -82,6 +82,7 @@ on:
 
 jobs:
   build:
+    timeout-minutes: 960
     name: Build torch
     uses: ./.github/workflows/build.yml
     secrets: inherit

From f9ffd6f282ca266d2e26f984fa1181cafeabc739 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Sat, 1 Feb 2025 12:34:23 -0600
Subject: [PATCH 81/94] ci(torch): Increase all job timeouts

[skip ci]
---
 .github/workflows/build.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 603b0b97..3adc93e5 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -35,6 +35,7 @@ jobs:
     name: Build Images
     runs-on: [ cw ]
     container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0'
+    timeout-minutes: 960
     defaults:
       run:
         shell: bash

From fb567b854d07c605b654d3afd18e5178bb62e4e0 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Sat, 1 Feb 2025 12:34:57 -0600
Subject: [PATCH 82/94] ci(torch): Remove `torch`-specific job timeout override

---
 .github/workflows/torch.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml
index 4b27af68..6538dff6 100644
--- a/.github/workflows/torch.yml
+++ b/.github/workflows/torch.yml
@@ -82,7 +82,6 @@ on:
 
 jobs:
   build:
-    timeout-minutes: 960
     name: Build torch
     uses: ./.github/workflows/build.yml
     secrets: inherit

From 0bd89968d09542ab328ea3e0ab54962fdd6aa223 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Sun, 2 Feb 2025 21:29:06 -0600
Subject: [PATCH 83/94] build(torch-extras): Specify DeepSpeed build flags
 better

[skip ci]
---
 torch-extras/Dockerfile | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 415608b4..55e58fa7 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -128,25 +128,25 @@ RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=d
 
 # DeepSpeed build flags
 # See: https://www.deepspeed.ai/tutorials/advanced-install
-ARG DS_BUILD_OPS="1"
+ARG DS_BUILD_OPS="0"
 ARG DS_BUILD_CCL_COMM="0"
-ARG DS_BUILD_CPU_ADAM=""
-ARG DS_BUILD_CPU_LION=""
+ARG DS_BUILD_CPU_ADAM="1"
+ARG DS_BUILD_CPU_LION="1"
 # Requires CUTLASS
 ARG DS_BUILD_EVOFORMER_ATTN="0"
-ARG DS_BUILD_FUSED_ADAM=""
-ARG DS_BUILD_FUSED_LION=""
-ARG DS_BUILD_CPU_ADAGRAD=""
-ARG DS_BUILD_FUSED_LAMB=""
-ARG DS_BUILD_QUANTIZER=""
-ARG DS_BUILD_RANDOM_LTD=""
+ARG DS_BUILD_FUSED_ADAM="1"
+ARG DS_BUILD_FUSED_LION="1"
+ARG DS_BUILD_CPU_ADAGRAD="1"
+ARG DS_BUILD_FUSED_LAMB="1"
+ARG DS_BUILD_QUANTIZER="1"
+ARG DS_BUILD_RANDOM_LTD="1"
 # sparse_attn has issues with PyTorch >= 2.0.0 as of DeepSpeed 0.9.4
 ARG DS_BUILD_SPARSE_ATTN="0"
-ARG DS_BUILD_TRANSFORMER=""
-ARG DS_BUILD_TRANSFORMER_INFERENCE=""
-ARG DS_BUILD_STOCHASTIC_TRANSFORMER=""
-ARG DS_BUILD_UTILS=""
-ARG DS_BUILD_AIO=""
+ARG DS_BUILD_TRANSFORMER="1"
+ARG DS_BUILD_TRANSFORMER_INFERENCE="1"
+ARG DS_BUILD_STOCHASTIC_TRANSFORMER="1"
+ARG DS_BUILD_UTILS="1"
+ARG DS_BUILD_AIO="1"
 
 ARG DEEPSPEED_VERSION
 
@@ -185,8 +185,9 @@ RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
     } && \
     CC=$(realpath -e ./compiler) \
       MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)}" \
+      DS_ACCELERATOR='cuda' \
       python3 -m pip wheel -w /wheels \
-      --no-cache-dir --no-build-isolation --no-deps \
+      --no-cache-dir --no-build-isolation --no-deps -v \
       deepspeed==${DEEPSPEED_VERSION} && \
     rm ./*
 SHELL ["/bin/sh", "-c"]

From 386fabe4d6e3af40fe9fc6c72e62d2c4af0545bc Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Mon, 3 Feb 2025 01:36:39 -0600
Subject: [PATCH 84/94] build(torch-extras): Remove `DS_ACCELERATOR`
 specification

[skip ci]
---
 torch-extras/Dockerfile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 55e58fa7..fbd5c29f 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -185,7 +185,6 @@ RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
     } && \
     CC=$(realpath -e ./compiler) \
       MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)}" \
-      DS_ACCELERATOR='cuda' \
       python3 -m pip wheel -w /wheels \
       --no-cache-dir --no-build-isolation --no-deps -v \
       deepspeed==${DEEPSPEED_VERSION} && \

From 45dd5a0a956f9032e16ea63ee7516a93de2f1151 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Fri, 7 Feb 2025 00:45:35 -0600
Subject: [PATCH 85/94] build(torch): Enable less-hacky 10.0 arch support in
 PyTorch

---
 torch/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 4f5862d0..421b2e16 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -10,8 +10,8 @@ ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1"
 ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1"
 ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TRITON="1"
-ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX"
-ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90"
+ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0 10.0+PTX"
+ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90;100"
 
 # 8.7 is supported in the PyTorch main branch, but not 2.0.0
 

From 90d178ba5ed7003f361963f891847247163a6bf7 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Fri, 7 Feb 2025 00:56:57 -0600
Subject: [PATCH 86/94] feat(sglang): Add `sglang` image

---
 .github/workflows/sglang.yml |  30 +++++++
 sglang/Dockerfile            |  29 +++++++
 sglang/build.bash            | 149 +++++++++++++++++++++++++++++++++++
 sglang/install.bash          |  32 ++++++++
 4 files changed, 240 insertions(+)
 create mode 100644 .github/workflows/sglang.yml
 create mode 100644 sglang/Dockerfile
 create mode 100644 sglang/build.bash
 create mode 100644 sglang/install.bash

diff --git a/.github/workflows/sglang.yml b/.github/workflows/sglang.yml
new file mode 100644
index 00000000..a851ecba
--- /dev/null
+++ b/.github/workflows/sglang.yml
@@ -0,0 +1,30 @@
+on:
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: 'Tag for the build'
+        required: true
+      base-image:
+        description: 'Base image from which to build'
+        required: true
+      builder-image:
+        description: 'Image to use to compile wheels, if different from the base image'
+        required: false
+  push:
+    paths:
+      - "sglang/**"
+      - ".github/workflows/sglang.yml"
+      - ".github/workflows/build.yml"
+
+jobs:
+  build:
+    uses: ./.github/workflows/build.yml
+    secrets: inherit
+    with:
+      image-name: sglang
+      folder: sglang
+      tag-suffix: ${{ inputs.tag || '386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1' }}
+      # BUILDER_IMAGE is passed only when set; sglang/Dockerfile defaults it to BASE_IMAGE.
+      build-args: |
+        BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:es-actions-386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1'}}
+        ${{ inputs.builder-image && format('BUILDER_IMAGE={0}', inputs.builder-image) || '' }}
diff --git a/sglang/Dockerfile b/sglang/Dockerfile
new file mode 100644
index 00000000..9adb87b0
--- /dev/null
+++ b/sglang/Dockerfile
@@ -0,0 +1,29 @@
+# syntax=docker/dockerfile:1.2
+ARG BASE_IMAGE
+ARG BUILDER_IMAGE="${BASE_IMAGE}"
+
+FROM ${BUILDER_IMAGE} AS builder
+
+ARG BUILD_TORCH_CUDA_ARCH_LIST='8.0 8.6 8.9 9.0 10.0+PTX'
+
+ARG FLASHINFER_COMMIT='c04755e21f4d6fb7813c703f2b00a7ef012be9b8'
+ARG CUTLASS_COMMIT='b78588d1630aa6643bf021613717bafb705df4ef'
+ARG VLLM_COMMIT='5095e966069b9e65b7c4c63427e06cebacaad0a0'
+ARG SGLANG_COMMIT='4b6f62e2bc52a528551e9a21e7b0a4945c6115bb'
+ARG DECORD_COMMIT='d2e56190286ae394032a8141885f76d5372bd44b'
+# Building Triton is not currently enabled,
+# but this is the commit that would be used if it were
+ARG TRITON_COMMIT='1e0e51c4aeb3e1beea000da5d0e494f8b9ac40dd'
+
+WORKDIR /build
+COPY build.bash /build/
+RUN mkdir /wheels && \
+    bash build.bash -a "${BUILD_TORCH_CUDA_ARCH_LIST}" && \
+    rm -rf /build/*
+COPY install.bash /wheels/
+
+FROM ${BASE_IMAGE}
+RUN --mount=type=bind,from=builder,source=/wheels,target=/wheels \
+    cd /wheels && \
+    bash install.bash
+RUN rmdir /wheels
diff --git a/sglang/build.bash b/sglang/build.bash
new file mode 100644
index 00000000..c0e0a7ad
--- /dev/null
+++ b/sglang/build.bash
@@ -0,0 +1,149 @@
+#!/bin/bash
+set -xeo pipefail
+
+TORCH_CUDA_ARCH_LIST=''
+FILTER_ARCHES=''
+BUILD_TRITON=''
+
+while getopts 'a:ft' OPT; do
+  case "${OPT}" in
+    a) TORCH_CUDA_ARCH_LIST="${OPTARG}" ;;
+    f) FILTER_ARCHES='1' ;;
+    t) BUILD_TRITON='1' ;;
+    *) exit 92 ;;
+  esac
+done
+
+export NVCC_APPEND_FLAGS='-gencode=arch=compute_100,code=[sm_100,compute_100] -gencode=arch=compute_100a,code=sm_100a --diag-suppress 174'
+export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0 10.0+PTX}"
+
+mkdir -p /wheels/logs
+
+_BUILD() { python3 -m build -w -n -v -o /wheels "${1:-.}"; }
+_LOG() { tee -a "/wheels/logs/${1:?}"; }
+_CONSTRAINTS="$(python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p')"
+_PIP_INSTALL() {
+  python3 -m pip install --no-cache-dir \
+  --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \
+  "$@"
+}
+
+_PIP_INSTALL -U pip setuptools wheel build pybind11 ninja cmake
+
+# triton (not compatible with torch 2.6)
+if [ "${BUILD_TRITON}" = 1 ]; then (
+  : "${TRITON_COMMIT:?}"
+  echo 'Building triton-lang/triton'
+  git clone --recursive --filter=blob:none https://github.com/triton-lang/triton
+  cd triton
+  git checkout "${TRITON_COMMIT}"
+  _BUILD python |& _LOG triton.log
+); fi
+
+# flashinfer
+: "${FLASHINFER_COMMIT:?}"
+: "${CUTLASS_COMMIT:?}"
+(
+echo 'Building flashinfer-ai/flashinfer'
+git clone --recursive --filter=blob:none https://github.com/flashinfer-ai/flashinfer
+cd flashinfer
+git checkout "${FLASHINFER_COMMIT}"
+sed -i 's/name = "flashinfer-python"/name = "flashinfer"/' pyproject.toml
+git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}"
+_PIP_INSTALL -U optree
+NVCC_APPEND_FLAGS="${NVCC_APPEND_FLAGS:+$NVCC_APPEND_FLAGS } --diag-suppress 20281,174" \
+  FLASHINFER_ENABLE_AOT=1 _BUILD . |& _LOG flashinfer.log
+)
+
+# Setup cutlass repo for vLLM to use
+git clone --recursive --filter=blob:none https://github.com/NVIDIA/cutlass
+git -C cutlass checkout "${CUTLASS_COMMIT}"
+
+# vLLM
+: "${VLLM_COMMIT:?}"
+(
+echo 'Building vllm-project/vllm'
+export VLLM_CUTLASS_SRC_DIR="${PWD}/cutlass"
+test -d "${VLLM_CUTLASS_SRC_DIR}"
+git clone --recursive --filter=blob:none https://github.com/vllm-project/vllm
+cd vllm
+git checkout "${VLLM_COMMIT}"
+# For lsmod
+apt-get -qq update && apt-get -qq install kmod
+python3 use_existing_torch.py
+_PIP_INSTALL -r requirements-build.txt
+_BUILD . |& _LOG vllm.log
+)
+
+# sglang
+: "${SGLANG_COMMIT:?}"
+(
+echo 'Building sglang'
+git clone --recursive --filter=blob:none https://github.com/sgl-project/sglang
+cd sglang
+git checkout "${SGLANG_COMMIT}"
+(
+cd sgl-kernel
+git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}"
+git -C 3rdparty/flashinfer/3rdparty/cutlass checkout "${CUTLASS_COMMIT}"
+
+ARCH_TRIPLE="$(gcc -print-multiarch)"
+LIB_DIR="/usr/lib/${ARCH_TRIPLE:?}"
+test -d "${LIB_DIR:?}"
+PYTHON_API_VER="$(
+  python3 --version | sed -En 's@Python ([0-9])\.([0-9]+)\..*@cp\1\2@p'
+)"
+ARCH_FILTER=()
+if [ "${FILTER_ARCHES}" = 1 ]; then
+  ARCH_FILTER=(-e 's@"-gencode=arch=compute_[78][0-9],code=sm_[78][0-9]",@#\0@')
+fi
+
+sed -Ei \
+  "${ARCH_FILTER[@]}" \
+  -e 's@/usr/lib/x86_64-linux-gnu@'"${LIB_DIR}"'@' \
+  -e 's@(\s+)(\w.+manylinux2014_x86_64.+)@\1pass  # \2@' \
+  -e 's@\{"py_limited_api": "cp39"}@{"py_limited_api": "'"${PYTHON_API_VER:-cp310}"'"}@' \
+  setup.py
+SGL_KERNEL_ENABLE_BF16=1 SGL_KERNEL_ENABLE_FP8=1 SGL_KERNEL_ENABLE_SM90A=1 \
+  _BUILD . |& _LOG sglang.log
+)
+_BUILD python |& _LOG sglang.log
+)
+
+# decord and xgrammar aren't available on PyPI for ARM64
+
+if [ ! "$(uname -m)" = 'x86_64' ]; then
+  # xgrammar (for sglang)
+  (
+  git clone --recursive --filter=blob:none -b v0.1.11 https://github.com/mlc-ai/xgrammar && \
+  cd xgrammar
+  (
+  mkdir build && cd build
+  cmake -S.. -B. -DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG xgrammar.log
+  cmake --build . |& _LOG xgrammar.log
+  )
+  _BUILD python |& _LOG xgrammar.log
+  )
+
+  # decord (for sglang)
+  : "${DECORD_COMMIT:?}"
+  (
+  apt-get -qq update && apt-get -q install --no-install-recommends \
+    build-essential python3-dev python3-setuptools \
+    make cmake ffmpeg \
+    libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev
+  git clone --recursive --filter=blob:none https://github.com/dmlc/decord
+  cd decord
+  git checkout "${DECORD_COMMIT}"
+  (
+  mkdir build && cd build
+  cmake -S.. -B. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG decord.log
+  cmake --build . |& _LOG decord.log
+  cp libdecord.so /wheels/libdecord.so
+  )
+  cd python
+  _BUILD . |& _LOG decord.log
+  )
+fi
+
+apt-get clean
diff --git a/sglang/install.bash b/sglang/install.bash
new file mode 100644
index 00000000..dfd39370
--- /dev/null
+++ b/sglang/install.bash
@@ -0,0 +1,32 @@
+#!/bin/bash
+set -xeo pipefail
+
+_CONSTRAINTS="$(
+  python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p'
+)"
+_PIP_INSTALL() {
+  python3 -m pip install --no-cache-dir \
+  --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \
+  "$@"
+}
+
+_PIP_INSTALL /wheels/*.whl
+if [ -x /wheels/libdecord.so ]; then
+  apt-get -qq update && apt-get -q install --no-install-recommends \
+    libavfilter7 libavformat58 && \
+  apt-get clean
+  cp /wheels/libdecord.so /usr/local/lib/ && ldconfig
+fi
+
+SGLANG_EXTRA_PIP_DEPENDENCIES=()
+if [ "$(uname -m)" = 'x86_64' ]; then
+  SGLANG_EXTRA_PIP_DEPENDENCIES=('decord' 'xgrammar>=0.1.10')
+fi
+_PIP_INSTALL \
+  'aiohttp' 'fastapi' \
+  'hf_transfer' 'huggingface_hub' 'interegular' 'modelscope' \
+  'orjson' 'packaging' 'pillow' 'prometheus-client>=0.20.0' \
+  'psutil' 'pydantic' 'python-multipart' 'pyzmq>=25.1.2' \
+  'torchao>=0.7.0' 'uvicorn' 'uvloop' \
+  'cuda-python' 'outlines>=0.0.44,<0.1.0' \
+  "${SGLANG_EXTRA_PIP_DEPENDENCIES[@]}"

From 855d2f3d893e598b913bfc1ba124da6e8efc7dee Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Fri, 7 Feb 2025 01:14:02 -0600
Subject: [PATCH 87/94] build(sglang): Use `USE_CUDNN` and `USE_CUSPARSELT`
 flags in vLLM build

---
 sglang/build.bash | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sglang/build.bash b/sglang/build.bash
index c0e0a7ad..e58ff0fd 100644
--- a/sglang/build.bash
+++ b/sglang/build.bash
@@ -72,7 +72,7 @@ git checkout "${VLLM_COMMIT}"
 apt-get -qq update && apt-get -qq install kmod
 python3 use_existing_torch.py
 _PIP_INSTALL -r requirements-build.txt
-_BUILD . |& _LOG vllm.log
+USE_CUDNN=1 USE_CUSPARSELT=1 _BUILD . |& _LOG vllm.log
 )
 
 # sglang

From dc70ca8e2ee57b4654e0779166c82bea7087e51b Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Fri, 7 Feb 2025 10:00:30 -0600
Subject: [PATCH 88/94] fix(sglang): Remove extraneous `rmdir` build step

---
 sglang/Dockerfile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sglang/Dockerfile b/sglang/Dockerfile
index 9adb87b0..2103ca20 100644
--- a/sglang/Dockerfile
+++ b/sglang/Dockerfile
@@ -26,4 +26,3 @@ FROM ${BASE_IMAGE}
 RUN --mount=type=bind,from=builder,source=/wheels,target=/wheels \
     cd /wheels && \
     bash install.bash
-RUN rmdir /wheels

From f113d38bcc90da52b6da06ae395b9f07f520fd79 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Fri, 7 Feb 2025 12:40:25 -0600
Subject: [PATCH 89/94] fix(sglang): Skip `apt` prompts

---
 sglang/build.bash   | 5 +++--
 sglang/install.bash | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/sglang/build.bash b/sglang/build.bash
index e58ff0fd..d72d7a37 100644
--- a/sglang/build.bash
+++ b/sglang/build.bash
@@ -1,5 +1,6 @@
 #!/bin/bash
 set -xeo pipefail
+export DEBIAN_FRONTEND=noninteractive
 
 TORCH_CUDA_ARCH_LIST=''
 FILTER_ARCHES=''
@@ -69,7 +70,7 @@ git clone --recursive --filter=blob:none https://github.com/vllm-project/vllm
 cd vllm
 git checkout "${VLLM_COMMIT}"
 # For lsmod
-apt-get -qq update && apt-get -qq install kmod
+apt-get -qq update && apt-get -qq install --no-install-recommends -y kmod
 python3 use_existing_torch.py
 _PIP_INSTALL -r requirements-build.txt
 USE_CUDNN=1 USE_CUSPARSELT=1 _BUILD . |& _LOG vllm.log
@@ -128,7 +129,7 @@ if [ ! "$(uname -m)" = 'x86_64' ]; then
   # decord (for sglang)
   : "${DECORD_COMMIT:?}"
   (
-  apt-get -qq update && apt-get -q install --no-install-recommends \
+  apt-get -qq update && apt-get -q install --no-install-recommends -y \
     build-essential python3-dev python3-setuptools \
     make cmake ffmpeg \
     libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev
diff --git a/sglang/install.bash b/sglang/install.bash
index dfd39370..07c23b6b 100644
--- a/sglang/install.bash
+++ b/sglang/install.bash
@@ -1,5 +1,6 @@
 #!/bin/bash
 set -xeo pipefail
+export DEBIAN_FRONTEND=noninteractive
 
 _CONSTRAINTS="$(
   python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p'
@@ -12,7 +13,7 @@ _PIP_INSTALL() {
 
 _PIP_INSTALL /wheels/*.whl
 if [ -x /wheels/libdecord.so ]; then
-  apt-get -qq update && apt-get -q install --no-install-recommends \
+  apt-get -qq update && apt-get -q install --no-install-recommends -y \
     libavfilter7 libavformat58 && \
   apt-get clean
   cp /wheels/libdecord.so /usr/local/lib/ && ldconfig

From f63ddbebf21c2a658482bfca96ce68cc3d101687 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Fri, 14 Feb 2025 14:30:40 -0600
Subject: [PATCH 90/94] ci(torch-nightly): Update runner image version

---
 .github/workflows/torch-nightly.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
index fb51869f..4693b963 100644
--- a/.github/workflows/torch-nightly.yml
+++ b/.github/workflows/torch-nightly.yml
@@ -20,7 +20,7 @@ jobs:
     name:
       Get Nightly Info
     runs-on: [ cw ]
-    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0'
     defaults:
       run:
         shell: bash

From 868a61122f8c5b6c8e0d063ee568a5fa8b7d9808 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Fri, 14 Feb 2025 14:31:11 -0600
Subject: [PATCH 91/94] ci: Parameterize build platforms

[skip ci]
---
 .github/workflows/build.yml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 3adc93e5..83707c88 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -19,6 +19,11 @@ on:
         required: false
         description: "Optional sub-key to append to the image name for build layer caching"
         type: string
+      platforms:
+        required: false
+        description: "Platforms for which to build (default: linux/amd64,linux/arm64)"
+        type: string
+        default: linux/amd64,linux/arm64
     outputs:
       outcome:
         description: "The outcome of the build"
@@ -110,7 +115,7 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
           cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
-          platforms: linux/amd64,linux/arm64
+          platforms: ${{ inputs.platforms }}
       - name: Clear registry credentials
         if: always()
         run: |

From 0e4411609d924aeddf1a997020191b1f689623d1 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Fri, 14 Feb 2025 14:34:38 -0600
Subject: [PATCH 92/94] fix(torch): Filter 10.0 arch builds on unsupported CUDA
 versions again

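The previous method didn't work when 10.0 was included in the
BUILD_TORCH_CUDA_ARCH_LIST build argument, so this instead derives
TORCH_CUDA_ARCH_LIST with chained parameter expansions in the
Dockerfile: " 10.0" is stripped from the list unless CUDA_VERSION
matches 12.8.*. The Dockerfile syntax directive is bumped from
1.4 to 1.7 alongside this.

This also keeps the previous NV_CUDA_LIB_VERSION (12.[89].*) check
that appends to the nvcc flags, but switches the appended flag
from compute_100 to force sm_100a builds on those supported
CUDA versions.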
---
 torch-extras/Dockerfile |  2 +-
 torch/Dockerfile        | 18 +++++++++++++-----
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index fbd5c29f..51346e0b 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -108,7 +108,7 @@ COPY --chmod=755 scale.sh .
 ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a"
 RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \
     case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \
-      FLAGS="${FLAGS} -gencode=arch=compute_100,code=[sm_100,compute_100]" ;; \
+      FLAGS="${FLAGS} -gencode=arch=compute_100a,code=sm_100a" ;; \
     esac && \
     echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf
 ARG BUILD_MAX_JOBS
diff --git a/torch/Dockerfile b/torch/Dockerfile
index 421b2e16..5db5c4aa 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -1,4 +1,4 @@
-# syntax=docker/dockerfile:1.4
+# syntax=docker/dockerfile:1.7
 ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.8.0-devel-ubuntu22.04"
 ARG FINAL_BASE_IMAGE="nvidia/cuda:12.8.0-base-ubuntu22.04"
 
@@ -330,13 +330,17 @@ RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,r
 ARG BUILD_TORCH_VERSION
 ARG BUILD_TORCH_CUDA_ARCH_LIST
 ENV TORCH_VERSION=$BUILD_TORCH_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
+# Filter out the 10.0 arch unless CUDA_VERSION matches 12.8.*: build "<ver-or-empty>||<filtered>||<full>", then trim to one list
+ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${BUILD_TORCH_CUDA_ARCH_LIST/ 10.0/}||${BUILD_TORCH_CUDA_ARCH_LIST}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}"
 
 ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a"
-# Add compute_100 build if NV_CUDA_LIB_VERSION matches 12.[89].*
+# Add sm_100a build if NV_CUDA_LIB_VERSION matches 12.[89].*
 RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \
     case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \
-      FLAGS="${FLAGS} -gencode=arch=compute_100,code=[sm_100,compute_100]" ;; \
+      FLAGS="${FLAGS} -gencode=arch=compute_100a,code=sm_100a" ;; \
     esac && \
     echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf
 
@@ -629,7 +633,11 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST
 ENV TORCH_VERSION=$BUILD_TORCH_VERSION
 ENV TORCH_VISION_VERSION=$BUILD_TORCH_VISION_VERSION
 ENV TORCH_AUDIO_VERSION=$BUILD_TORCH_AUDIO_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
+# Filter out the 10.0 arch unless CUDA_VERSION matches 12.8.*: build "<ver-or-empty>||<filtered>||<full>", then trim to one list
+ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${BUILD_TORCH_CUDA_ARCH_LIST/ 10.0/}||${BUILD_TORCH_CUDA_ARCH_LIST}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}"
 
 COPY --link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh
 # - libnvjitlink-X-Y only exists for CUDA versions >= 12-0.

From 2f37df6dfa8a8aab07e18eb2ef5b54e40267a1b5 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Fri, 14 Feb 2025 15:37:20 -0600
Subject: [PATCH 93/94] fix(torch): Filter 10.0 arch builds in
 TransformerEngine build

---
 torch/Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torch/Dockerfile b/torch/Dockerfile
index 5db5c4aa..97f0f759 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -513,6 +513,9 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE
     echo "MAX_JOBS: ${MAX_JOBS}" && \
     export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
     echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
+    case "${CUDA_VERSION}" in 12.[0123456].*) \
+      export NVTE_CUDA_ARCHS="${NVTE_CUDA_ARCHS%;100*}" ;; \
+    esac && \
     cd TransformerEngine && \
     if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \
       sed -i "s/from functools import cache/from functools import lru_cache as cache/g" \

From 68fbfd11691200e55d920709b9a4802b639b0c29 Mon Sep 17 00:00:00 2001
From: Eta <esyra@coreweave.com>
Date: Fri, 14 Feb 2025 16:10:12 -0600
Subject: [PATCH 94/94] ci(torch): Rework logic for passing various build
 arguments

---
 .github/workflows/torch-base.yml    |  2 +-
 .github/workflows/torch-nccl.yml    |  2 +-
 .github/workflows/torch-nightly.yml |  8 ++++----
 .github/workflows/torch.yml         | 17 ++++-------------
 torch/Dockerfile                    |  2 +-
 5 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml
index a6fc2fb1..b93fbbae 100644
--- a/.github/workflows/torch-base.yml
+++ b/.github/workflows/torch-base.yml
@@ -41,6 +41,6 @@ jobs:
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
-      cxx11-abi: ${{ matrix.abi }}
+      additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }}
       cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml
index feae8f7b..ede0fdf0 100644
--- a/.github/workflows/torch-nccl.yml
+++ b/.github/workflows/torch-nccl.yml
@@ -49,6 +49,6 @@ jobs:
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
-      cxx11-abi: ${{ matrix.abi }}
+      additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }}
       cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
index 4693b963..139d23d9 100644
--- a/.github/workflows/torch-nightly.yml
+++ b/.github/workflows/torch-nightly.yml
@@ -93,13 +93,13 @@ jobs:
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-base.yml
-      filter: 'del(.include) | .exclude |= . + [{"os": "ubuntu20.04"}]'
+      filter: 'del(.include) | .exclude |= . + [{"abi": "0"}]'
   get-nccl-config:
     name: Get torch:nccl Config
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-nccl.yml
-      filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"os": "ubuntu20.04"}]'
+      filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"abi": "0"}]'
 
   build-base:
     name: Build Nightly torch:base
@@ -119,7 +119,7 @@ jobs:
       torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
       torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
       torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
-      triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
+      additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }}
       cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
   build-nccl:
@@ -140,6 +140,6 @@ jobs:
       torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
       torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
       torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
-      triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
+      additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }}
       cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml
index 6538dff6..938b4306 100644
--- a/.github/workflows/torch.yml
+++ b/.github/workflows/torch.yml
@@ -19,10 +19,7 @@ on:
       torchaudio-version:
         required: true
         type: string
-      triton-version:
-        required: false
-        type: string
-      cxx11-abi:
+      additional-build-args:
         required: false
         type: string
       image-name:
@@ -62,13 +59,9 @@ on:
         required: true
         description: "Tagged version number from pytorch/audio to build"
         type: string
-      triton-version:
-        required: false
-        description: "Tagged version number from openai/triton to build"
-        type: string
-      cxx11-abi:
+      additional-build-args:
         required: false
-        description: "Build with the CXX11 ABI (1 = enable, 0 = disable)"
+        description: "Further --build-arg parameters for the build"
         type: string
       image-name:
         required: false
@@ -97,9 +90,7 @@ jobs:
         BUILD_TORCH_VERSION=${{ inputs.torch-version }}
         BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }}
         BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }}
-        BUILD_TORCH_CUDA_ARCH_LIST=7.0 7.5 8.0 8.6 8.9 9.0+PTX
-        ${{ inputs.cxx11-abi && format('BUILD_CXX11_ABI={0}', inputs.cxx11-abi) || '' }}
-        ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }}
+        ${{ inputs.additional-build-args }}
   build-extras:
     name: Build torch-extras
     if: inputs.build-extras
diff --git a/torch/Dockerfile b/torch/Dockerfile
index 97f0f759..e070232a 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -10,7 +10,7 @@ ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1"
 ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1"
 ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TRITON="1"
-ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0 10.0+PTX"
+ARG BUILD_TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0 10.0+PTX"
 ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90;100"
 
 # 8.7 is supported in the PyTorch main branch, but not 2.0.0