diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml
index 761f2b6b..f75b79a5 100644
--- a/.github/configurations/torch-base.yml
+++ b/.github/configurations/torch-base.yml
@@ -1,6 +1,7 @@
-cuda: [ 12.6.1, 12.4.1, 12.2.2 ]
-os: [ ubuntu22.04, ubuntu20.04 ]
+cuda: [ 12.8.0, 12.6.3, 12.4.1 ]
+os: [ ubuntu22.04 ]
+abi: [ 1, 0 ]  # forwarded by torch-base.yml as BUILD_CXX11_ABI and appended to the tag as -abi<N>
 include:
-  - torch: 2.5.0
-    vision: 0.20.0
-    audio: 2.5.0
+  - torch: 2.6.0
+    vision: 0.21.0
+    audio: 2.6.0
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
index 6bd5f029..ec1f1f91 100644
--- a/.github/configurations/torch-nccl.yml
+++ b/.github/configurations/torch-nccl.yml
@@ -1,37 +1,9 @@
-image:
-  # Ubuntu 22.04
-  - cuda: 12.6.1
-    cudnn: cudnn
-    os: ubuntu22.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
-  - cuda: 12.4.1
-    cudnn: cudnn
-    os: ubuntu22.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
-  - cuda: 12.2.2
-    cudnn: cudnn8
-    os: ubuntu22.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
-  # Ubuntu 20.04
-  - cuda: 12.6.1
-    cudnn: cudnn
-    os: ubuntu20.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
-  - cuda: 12.4.1
-    cudnn: cudnn
-    os: ubuntu20.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
-  - cuda: 12.2.2
-    cudnn: cudnn8
-    os: ubuntu20.04
-    nccl: 2.21.5-1
-    nccl-tests-hash: 2ff05b2
+cuda: [ 12.8.0, 12.6.3, 12.4.1 ]
+os: [ ubuntu22.04 ]
+abi: [ 1, 0 ]  # forwarded by torch-nccl.yml as BUILD_CXX11_ABI and appended to the tag as -abi<N>
 include:
-  - torch: 2.5.0
-    vision: 0.20.0
-    audio: 2.5.0
+  - torch: 2.6.0
+    vision: 0.21.0
+    audio: 2.6.0
+    nccl: 2.25.1-1  # part of the ghcr.io/coreweave/nccl-tests base image tag (…-nccl<version>-<hash>)
+    nccl-tests-hash: 57fa979  # trailing component of the nccl-tests base image tag
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index cbb09fdc..83707c88 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -19,6 +19,11 @@ on:
         required: false
         description: "Optional sub-key to append to the image name for build layer caching"
         type: string
+      platforms:
+        required: false
+        description: "Platforms for which to build (default: linux/amd64,linux/arm64)"
+        type: string
+        default: linux/amd64,linux/arm64
     outputs:
       outcome:
         description: "The outcome of the build"
@@ -33,26 +38,42 @@ on:
 jobs:
   build:
     name: Build Images
-    runs-on: [ self-hosted, Linux ]
+    runs-on: [ cw ]
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0'
+    timeout-minutes: 960
+    defaults:
+      run:
+        shell: bash
     outputs:
       outcome: ${{ steps.docker-build.outcome }}
       tags: ${{ steps.meta.outputs.tags }}
       version: ${{ steps.meta.outputs.version }}
     steps:
-      - uses: actions/checkout@v3
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2.2.1
-      - name: Login to GitHub container registry
-        uses: docker/login-action@v2.2.0
+      - uses: actions/checkout@v4
+      - name: Fetch BuildKit Client Certs
+        uses: dopplerhq/secrets-fetch-action@v1.2.0
+        id: client-certs
         with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Login to DockerHub container registry
-        uses: docker/login-action@v2.2.0
+          doppler-token: ${{ secrets.ORG_BUILDKIT_CLIENT_TOKEN }}
+          doppler-project: ${{ secrets.BUILDKIT_CONSUMER_DOPPLER_PROJECT }}
+          doppler-config: prod
+          inject-env-vars: false
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3.7.1
         with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+          driver: remote
+          endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }}
+          platforms: linux/amd64
+          append: |
+            - endpoint: ${{ secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }}
+              platforms: linux/arm64
+        env:
+          BUILDER_NODE_0_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
+          BUILDER_NODE_0_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
+          BUILDER_NODE_0_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
+          BUILDER_NODE_1_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
+          BUILDER_NODE_1_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
+          BUILDER_NODE_1_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
       - name: Get base registry
         run: |
           echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV
@@ -70,14 +91,21 @@ jobs:
           echo "CACHE_KEY=${{ inputs.image-name }}-${{ inputs.cache-key }}" >> $GITHUB_ENV
       - name: Extract metadata (tags, labels) for Docker
         id: meta
-        uses: docker/metadata-action@v4.1.1
+        uses: docker/metadata-action@v5.5.1
         with:
           images: ${{ env.REGISTRY }}/${{ inputs.image-name }}
           tags: |
             type=sha,prefix=${{ env.TAG_PREFIX }},suffix=${{ env.TAG_SUFFIX }},format=short
+      - name: Initialize registry credentials file  # writes a ghcr.io login for the build step
+        env:
+          USER: ${{ github.actor }}
+          PASS: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          jq -n '.auths."ghcr.io" = { username: env.USER, password: env.PASS }' \
+          | install -D -m400 /dev/stdin ~/.docker/config.json  # -D creates ~/.docker if it is missing
       - name: Build and push Docker image
         id: docker-build
-        uses: docker/build-push-action@v3.2.0
+        uses: docker/build-push-action@v6.9.0
         with:
           context: ${{ inputs.folder }}
           build-args: |-
@@ -87,6 +115,11 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
           cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
+          platforms: ${{ inputs.platforms }}
+      - name: Clear registry credentials
+        if: always()
+        run: |
+          rm -f ~/.docker/config.json && [ ! -e ~/.docker/config.json ]
       - uses: 8BitJonny/gh-get-current-pr@2.1.3
         id: PR
         with:
diff --git a/.github/workflows/read-configuration.yml b/.github/workflows/read-configuration.yml
index 12a21b31..25f5de3e 100644
--- a/.github/workflows/read-configuration.yml
+++ b/.github/workflows/read-configuration.yml
@@ -17,12 +17,16 @@ on:
 jobs:
   read-file:
     name: Read Configuration File
-    runs-on: ["self-hosted", "Linux"]
+    runs-on: [ cw ]
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
+    defaults:
+      run:
+        shell: bash
     permissions: {}
     outputs:
       config: ${{ steps.read.outputs.contents }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Read configuration
         id: read
         env:
diff --git a/.github/workflows/sglang.yml b/.github/workflows/sglang.yml
new file mode 100644
index 00000000..a851ecba
--- /dev/null
+++ b/.github/workflows/sglang.yml
@@ -0,0 +1,30 @@
+on:
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: 'Tag for the build'
+        required: true
+      base-image:
+        description: 'Base image from which to build'
+        required: true
+      builder-image:
+        description: 'Image to use to compile wheels, if different from the base image'
+        required: false
+  push:
+    paths:
+      - "sglang/**"
+      - ".github/workflows/sglang.yml"
+      - ".github/workflows/build.yml"
+
+# Builds the sglang image through the shared build workflow.
+jobs:
+  build:
+    uses: ./.github/workflows/build.yml
+    secrets: inherit
+    with:  # BUILDER_IMAGE is only passed when the builder-image input is set
+      image-name: sglang
+      folder: sglang
+      tag-suffix: ${{ inputs.tag || '386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1' }}
+      build-args: |
+        BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:es-actions-386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1'}}
+        ${{ inputs.builder-image && 'BUILDER_IMAGE=' }}${{ inputs.builder-image }}
diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml
index 93148a65..b93fbbae 100644
--- a/.github/workflows/torch-base.yml
+++ b/.github/workflows/torch-base.yml
@@ -35,11 +35,12 @@ jobs:
     secrets: inherit
     with:
       image-name: ${{ inputs.image-name }}
-      tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
+      tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }}
       builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-${{ matrix.os }}
       base-image: nvidia/cuda:${{ matrix.cuda }}-base-${{ matrix.os }}
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
+      additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }}
       cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml
index ca7134ed..e37a6c18 100644
--- a/.github/workflows/torch-extras.yml
+++ b/.github/workflows/torch-extras.yml
@@ -51,13 +51,17 @@ jobs:
   get-required-bases:
     name: Get Latest Required Base Images
     if: inputs.skip-bases-check != true
-    runs-on: ["self-hosted", "Linux"]
+    runs-on: [ cw ]
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'
+    defaults:
+      run:
+        shell: bash
     permissions:
       packages: read
     outputs:
       bases-list: ${{ steps.choose-bases.outputs.list }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
       - name: Check if torch-extras needs to be rebuilt from previous bases
diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml
index aee13052..ede0fdf0 100644
--- a/.github/workflows/torch-nccl.yml
+++ b/.github/workflows/torch-nccl.yml
@@ -43,11 +43,12 @@ jobs:
     secrets: inherit
     with:
       image-name: ${{ inputs.image-name }}
-      tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.image.cuda, matrix.image.os, matrix.image.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
-      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
-      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }}
+      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
+      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
-      cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }}
+      additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }}
+      cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
index 063e40af..139d23d9 100644
--- a/.github/workflows/torch-nightly.yml
+++ b/.github/workflows/torch-nightly.yml
@@ -19,7 +19,11 @@ jobs:
   get-nightly-info:
     name:
       Get Nightly Info
-    runs-on: [ self-hosted, Linux ]
+    runs-on: [ cw ]
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0'
+    defaults:
+      run:
+        shell: bash
     outputs:
       pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }}
       triton-commit: ${{ steps.get-hash.outputs.triton-commit }}
@@ -89,13 +93,13 @@ jobs:
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-base.yml
-      filter: del(.include)
+      filter: 'del(.include) | .exclude |= . + [{"abi": "0"}]'
   get-nccl-config:
     name: Get torch:nccl Config
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-nccl.yml
-      filter: del(.include)
+      filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"abi": "0"}]'  # strip torch/vision/audio from include entries; append {"abi": "0"} to exclude
 
   build-base:
     name: Build Nightly torch:base
@@ -115,7 +119,7 @@ jobs:
       torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
       torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
       torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
-      triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
+      additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }}
       cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
   build-nccl:
@@ -130,12 +134,12 @@ jobs:
     secrets: inherit
     with:
       image-name: nightly-torch
-      tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.os, matrix.image.nccl, needs.get-nightly-info.outputs.version-string ) }}
-      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
-      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.cuda, matrix.os, matrix.nccl, needs.get-nightly-info.outputs.version-string ) }}
+      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
+      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
       torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
       torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
       torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
-      triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
-      cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }}
+      additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }}
+      cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml
index 36bdcc6d..938b4306 100644
--- a/.github/workflows/torch.yml
+++ b/.github/workflows/torch.yml
@@ -19,13 +19,9 @@ on:
       torchaudio-version:
         required: true
         type: string
-      triton-version:
+      additional-build-args:
         required: false
         type: string
-      cuda-arch-support:
-        required: false
-        type: string
-        default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
       image-name:
         required: false
         type: string
@@ -63,15 +59,10 @@ on:
         required: true
         description: "Tagged version number from pytorch/audio to build"
         type: string
-      triton-version:
-        required: false
-        description: "Tagged version number from openai/triton to build"
-        type: string
-      cuda-arch-support:
+      additional-build-args:
         required: false
-        description: "Space-separated list of CUDA architectures to support"
+        description: "Further --build-arg parameters for the build"
         type: string
-        default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
       image-name:
         required: false
         description: "Custom name under which to publish the resulting container"
@@ -99,8 +90,7 @@ jobs:
         BUILD_TORCH_VERSION=${{ inputs.torch-version }}
         BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }}
         BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }}
-        ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
-        ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }}
+        ${{ inputs.additional-build-args }}
   build-extras:
     name: Build torch-extras
     if: inputs.build-extras
diff --git a/sglang/Dockerfile b/sglang/Dockerfile
new file mode 100644
index 00000000..2103ca20
--- /dev/null
+++ b/sglang/Dockerfile
@@ -0,0 +1,28 @@
+# syntax=docker/dockerfile:1.2
+ARG BASE_IMAGE
+ARG BUILDER_IMAGE="${BASE_IMAGE}"
+# Stage 1 (builder): run build.bash to produce wheels and logs under /wheels
+FROM ${BUILDER_IMAGE} AS builder
+
+ARG BUILD_TORCH_CUDA_ARCH_LIST='8.0 8.6 8.9 9.0 10.0+PTX'
+
+ARG FLASHINFER_COMMIT='c04755e21f4d6fb7813c703f2b00a7ef012be9b8'
+ARG CUTLASS_COMMIT='b78588d1630aa6643bf021613717bafb705df4ef'
+ARG VLLM_COMMIT='5095e966069b9e65b7c4c63427e06cebacaad0a0'
+ARG SGLANG_COMMIT='4b6f62e2bc52a528551e9a21e7b0a4945c6115bb'
+ARG DECORD_COMMIT='d2e56190286ae394032a8141885f76d5372bd44b'
+# Building Triton is not currently enabled,
+# but this is the commit that would be used if it were
+ARG TRITON_COMMIT='1e0e51c4aeb3e1beea000da5d0e494f8b9ac40dd'
+
+WORKDIR /build
+COPY build.bash /build/
+RUN mkdir /wheels && \
+    bash build.bash -a "${BUILD_TORCH_CUDA_ARCH_LIST}" && \
+    rm -rf /build/*
+COPY install.bash /wheels/
+# Stage 2 (final image): bind-mount /wheels from the builder stage and run install.bash
+FROM ${BASE_IMAGE}
+RUN --mount=type=bind,from=builder,source=/wheels,target=/wheels \
+    cd /wheels && \
+    bash install.bash
diff --git a/sglang/build.bash b/sglang/build.bash
new file mode 100644
index 00000000..d72d7a37
--- /dev/null
+++ b/sglang/build.bash
@@ -0,0 +1,150 @@
+#!/bin/bash
+set -xeo pipefail
+export DEBIAN_FRONTEND=noninteractive
+
+TORCH_CUDA_ARCH_LIST=''  # set by -a; a default is applied after option parsing
+FILTER_ARCHES=''  # set to 1 by -f; enables the extra sed filter on sgl-kernel's setup.py
+BUILD_TRITON=''  # set to 1 by -t; enables the triton build
+
+while getopts 'a:ft' OPT; do
+  case "${OPT}" in
+    a) TORCH_CUDA_ARCH_LIST="${OPTARG}" ;;
+    f) FILTER_ARCHES='1' ;;
+    t) BUILD_TRITON='1' ;;
+    *) exit 92 ;;  # unrecognized option
+  esac
+done
+
+export NVCC_APPEND_FLAGS='-gencode=arch=compute_100,code=[sm_100,compute_100] -gencode=arch=compute_100a,code=sm_100a --diag-suppress 174'
+export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0 10.0+PTX}"
+
+mkdir -p /wheels/logs
+
+_BUILD() { python3 -m build -w -n -v -o /wheels "${1:-.}"; }  # build a wheel from dir $1 (default: .) into /wheels
+_LOG() { tee -a "/wheels/logs/${1:?}"; }  # copy stdin to stdout and append it to /wheels/logs/$1 (argument required)
+_CONSTRAINTS="$(python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p')"  # name==version lines for the installed torch, torchvision, torchaudio
+_PIP_INSTALL() {  # pip install "$@" with _CONSTRAINTS supplied as the constraints file via stdin
+  python3 -m pip install --no-cache-dir \
+  --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \
+  "$@"
+}
+
+_PIP_INSTALL -U pip setuptools wheel build pybind11 ninja cmake
+
+# triton (not compatible with torch 2.6)
+if [ "${BUILD_TRITON}" = 1 ]; then (
+  : "${TRITON_COMMIT:?}"
+  echo 'Building triton-lang/triton'
+  git clone --recursive --filter=blob:none https://github.com/triton-lang/triton
+  cd triton
+  git checkout "${TRITON_COMMIT}"
+  _BUILD python |& _LOG triton.log
+); fi
+
+# flashinfer
+: "${FLASHINFER_COMMIT:?}"
+: "${CUTLASS_COMMIT:?}"
+(
+echo 'Building flashinfer-ai/flashinfer'
+git clone --recursive --filter=blob:none https://github.com/flashinfer-ai/flashinfer
+cd flashinfer
+git checkout "${FLASHINFER_COMMIT}"
+sed -i 's/name = "flashinfer-python"/name = "flashinfer"/' pyproject.toml
+git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}"
+_PIP_INSTALL -U optree
+NVCC_APPEND_FLAGS="${NVCC_APPEND_FLAGS:+$NVCC_APPEND_FLAGS } --diag-suppress 20281,174" \
+  FLASHINFER_ENABLE_AOT=1 _BUILD . |& _LOG flashinfer.log
+)
+
+# Setup cutlass repo for vLLM to use
+git clone --recursive --filter=blob:none https://github.com/NVIDIA/cutlass
+git -C cutlass checkout "${CUTLASS_COMMIT}"
+
+# vLLM
+: "${VLLM_COMMIT:?}"
+(
+echo 'Building vllm-project/vllm'
+export VLLM_CUTLASS_SRC_DIR="${PWD}/cutlass"
+test -d "${VLLM_CUTLASS_SRC_DIR}"
+git clone --recursive --filter=blob:none https://github.com/vllm-project/vllm
+cd vllm
+git checkout "${VLLM_COMMIT}"
+# For lsmod
+apt-get -qq update && apt-get -qq install --no-install-recommends -y kmod
+python3 use_existing_torch.py
+_PIP_INSTALL -r requirements-build.txt
+USE_CUDNN=1 USE_CUSPARSELT=1 _BUILD . |& _LOG vllm.log
+)
+
+# sglang
+: "${SGLANG_COMMIT:?}"
+(
+echo 'Building sglang'
+git clone --recursive --filter=blob:none https://github.com/sgl-project/sglang
+cd sglang
+git checkout "${SGLANG_COMMIT}"
+(
+cd sgl-kernel
+git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}"
+git -C 3rdparty/flashinfer/3rdparty/cutlass checkout "${CUTLASS_COMMIT}"
+
+ARCH_TRIPLE="$(gcc -print-multiarch)"
+LIB_DIR="/usr/lib/${ARCH_TRIPLE:?}"
+test -d "${LIB_DIR:?}"
+PYTHON_API_VER="$(
+  python3 --version | sed -En 's@Python ([0-9])\.([0-9]+)\..*@cp\1\2@p'
+)"
+ARCH_FILTER=()
+if [ "${FILTER_ARCHES}" = 1 ]; then
+  ARCH_FILTER=(-e 's@"-gencode=arch=compute_[78][0-9],code=sm_[78][0-9]",@#\0@')
+fi
+
+sed -Ei \
+  "${ARCH_FILTER[@]}" \
+  -e 's@/usr/lib/x86_64-linux-gnu@'"${LIB_DIR}"'@' \
+  -e 's@(\s+)(\w.+manylinux2014_x86_64.+)@\1pass  # \2@' \
+  -e 's@\{"py_limited_api": "cp39"}@{"py_limited_api": "'"${PYTHON_API_VER:-cp310}"'"}@' \
+  setup.py
+SGL_KERNEL_ENABLE_BF16=1 SGL_KERNEL_ENABLE_FP8=1 SGL_KERNEL_ENABLE_SM90A=1 \
+  _BUILD . |& _LOG sglang.log
+)
+_BUILD python |& _LOG sglang.log
+)
+
+# decord and xgrammar aren't available on PyPI for ARM64
+
+if [ ! "$(uname -m)" = 'x86_64' ]; then
+  # xgrammar (for sglang)
+  (
+  git clone --recursive --filter=blob:none -b v0.1.11 https://github.com/mlc-ai/xgrammar && \
+  cd xgrammar
+  (
+  mkdir build && cd build
+  cmake -S.. -B. -DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG xgrammar.log
+  cmake --build . |& _LOG xgrammar.log
+  )
+  _BUILD python |& _LOG xgrammar.log
+  )
+
+  # decord (for sglang)
+  : "${DECORD_COMMIT:?}"
+  (
+  apt-get -qq update && apt-get -q install --no-install-recommends -y \
+    build-essential python3-dev python3-setuptools \
+    make cmake ffmpeg \
+    libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev
+  git clone --recursive --filter=blob:none https://github.com/dmlc/decord
+  cd decord
+  git checkout "${DECORD_COMMIT}"
+  (
+  mkdir build && cd build
+  cmake -S.. -B. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG decord.log
+  cmake --build . |& _LOG decord.log
+  cp libdecord.so /wheels/libdecord.so
+  )
+  cd python
+  _BUILD . |& _LOG decord.log
+  )
+fi
+
+apt-get clean
diff --git a/sglang/install.bash b/sglang/install.bash
new file mode 100644
index 00000000..07c23b6b
--- /dev/null
+++ b/sglang/install.bash
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -xeo pipefail
+export DEBIAN_FRONTEND=noninteractive
+
+_CONSTRAINTS="$(
+  python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p'
+)"
+_PIP_INSTALL() {
+  python3 -m pip install --no-cache-dir \
+  --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \
+  "$@"
+}
+
+_PIP_INSTALL /wheels/*.whl  # install every wheel produced by the builder stage
+if [ -f /wheels/libdecord.so ]; then  # build.bash only copies this when it compiles decord (non-x86_64)
+  apt-get -qq update && apt-get -q install --no-install-recommends -y \
+    libavfilter7 libavformat58 && \
+  apt-get clean
+  cp /wheels/libdecord.so /usr/local/lib/ && ldconfig
+fi
+
+SGLANG_EXTRA_PIP_DEPENDENCIES=()
+if [ "$(uname -m)" = 'x86_64' ]; then
+  SGLANG_EXTRA_PIP_DEPENDENCIES=('decord' 'xgrammar>=0.1.10')
+fi
+_PIP_INSTALL \
+  'aiohttp' 'fastapi' \
+  'hf_transfer' 'huggingface_hub' 'interegular' 'modelscope' \
+  'orjson' 'packaging' 'pillow' 'prometheus-client>=0.20.0' \
+  'psutil' 'pydantic' 'python-multipart' 'pyzmq>=25.1.2' \
+  'torchao>=0.7.0' 'uvicorn' 'uvloop' \
+  'cuda-python' 'outlines>=0.0.44,<0.1.0' \
+  "${SGLANG_EXTRA_PIP_DEPENDENCIES[@]}"
diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 3c785f0f..51346e0b 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -2,8 +2,11 @@
 
 ARG BASE_IMAGE
 ARG DEEPSPEED_VERSION="0.14.4"
-ARG APEX_COMMIT="23c1f86520e22b505e8fdfcf6298273dff2d93d8"
+ARG APEX_COMMIT="a1df80457ba67d60cbdb0d3ddfb08a2702c821a8"
+ARG DEEPSPEED_KERNELS_COMMIT="e77acc40b104696d4e73229b787d1ef29a9685b1"
+ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST="80;86;89;90"
 ARG XFORMERS_VERSION="0.0.28.post1"
+ARG BUILD_MAX_JOBS=""
 
 FROM alpine/git:2.36.3 as apex-downloader
 WORKDIR /git
@@ -16,6 +19,18 @@ RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
       --depth 1 --filter=blob:none && \
     find -type d -name docs -prune -exec rm -r '{}' ';'
 
+# Clone DeepSpeed-Kernels and check out DEEPSPEED_KERNELS_COMMIT with its submodules
+FROM alpine/git:2.36.3 as ds-kernels-downloader
+WORKDIR /git
+ARG DEEPSPEED_KERNELS_COMMIT
+RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
+      https://github.com/microsoft/DeepSpeed-Kernels ds-kernels && \
+    cd ds-kernels && \
+    git checkout "${DEEPSPEED_KERNELS_COMMIT}" && \
+    git submodule update --init --recursive --jobs 8 \
+      --depth 1 --filter=blob:none
+
+
 # Dependencies requiring NVCC are built ahead of time in a separate stage
 # so that the ~2 GiB dev library installations don't have to be included
 # in the final image.
@@ -32,7 +47,6 @@ RUN export \
       libcublas-dev-${CUDA_PACKAGE_VERSION} \
       libcusparse-dev-${CUDA_PACKAGE_VERSION} \
       libcusolver-dev-${CUDA_PACKAGE_VERSION} \
-      cuda-nvprof-${CUDA_PACKAGE_VERSION} \
       cuda-profiler-api-${CUDA_PACKAGE_VERSION} \
       cuda-nvtx-${CUDA_PACKAGE_VERSION} \
       cuda-nvrtc-dev-${CUDA_PACKAGE_VERSION} && \
@@ -58,18 +72,15 @@ RUN apt-get -qq update && apt-get -qq install -y \
 
 # Update compiler (GCC) and linker (LLD) versions
 # gfortran-11 is just for compiler_wrapper.f95
-RUN CODENAME="$(lsb_release -cs)" && \
-    wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
-    apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \
-    apt-add-repository -y ppa:ubuntu-toolchain-r/test && \
+RUN LLVM_VERSION='18' && \
     apt-get -qq update && apt-get -qq install --no-install-recommends -y \
-      gcc-11 g++-11 gfortran-11 lld-17 && \
+      gcc-11 g++-11 gfortran-11 "lld-$LLVM_VERSION" && \
     apt-get clean && \
     update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \
     update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
     update-alternatives --install \
       /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \
-    update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1
+    update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1
 
 RUN mkdir /wheels /build
 WORKDIR /build
@@ -80,40 +91,69 @@ WORKDIR /build
 # The compiler wrapper normalizes -march=native to -march=skylake
 # along with a couple other transformations before invoking GCC.
 COPY compiler_wrapper.f95 .
-RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
+ARG AMD64_NATIVE_ARCH="skylake"
+ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres"
+RUN if [ "$(uname -m)" = "aarch64" ]; then \
+      NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \
+      AVX='WRAPPER_NO_AVX'; \
+    else \
+      NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \
+      AVX='WRAPPER_AVX="AVX256"'; \
+    fi && \
+    gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
 
 COPY --chmod=755 effective_cpu_count.sh .
 COPY --chmod=755 scale.sh .
 
+ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a"
+RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \
+    case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \
+      FLAGS="${FLAGS} -gencode=arch=compute_100a,code=sm_100a" ;; \
+    esac && \
+    echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf
+ARG BUILD_MAX_JOBS
+
 
 FROM builder-base as deepspeed-builder
+# Build the DeepSpeed-Kernels wheel into /wheels from the downloader stage's checkout, then install it
+ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST
+RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=ds-kernels/,rw \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    cd ds-kernels && \
+    export CUDA_ARCH_LIST="${DEEPSPEED_KERNELS_CUDA_ARCH_LIST}" && \
+    echo "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" && \
+    python3 -m pip wheel -w /wheels \
+      --no-cache-dir --no-build-isolation --no-deps . && \
+    python3 -m pip install /wheels/*.whl
+
 # DeepSpeed build flags
 # See: https://www.deepspeed.ai/tutorials/advanced-install
-ARG DS_BUILD_OPS="1"
+ARG DS_BUILD_OPS="0"
 ARG DS_BUILD_CCL_COMM="0"
-ARG DS_BUILD_CPU_ADAM=""
-ARG DS_BUILD_CPU_LION=""
+ARG DS_BUILD_CPU_ADAM="1"
+ARG DS_BUILD_CPU_LION="1"
 # Requires CUTLASS
 ARG DS_BUILD_EVOFORMER_ATTN="0"
-ARG DS_BUILD_FUSED_ADAM=""
-ARG DS_BUILD_FUSED_LION=""
-ARG DS_BUILD_CPU_ADAGRAD=""
-ARG DS_BUILD_FUSED_LAMB=""
-ARG DS_BUILD_QUANTIZER=""
-ARG DS_BUILD_RANDOM_LTD=""
+ARG DS_BUILD_FUSED_ADAM="1"
+ARG DS_BUILD_FUSED_LION="1"
+ARG DS_BUILD_CPU_ADAGRAD="1"
+ARG DS_BUILD_FUSED_LAMB="1"
+ARG DS_BUILD_QUANTIZER="1"
+ARG DS_BUILD_RANDOM_LTD="1"
 # sparse_attn has issues with PyTorch >= 2.0.0 as of DeepSpeed 0.9.4
 ARG DS_BUILD_SPARSE_ATTN="0"
-ARG DS_BUILD_TRANSFORMER=""
-ARG DS_BUILD_TRANSFORMER_INFERENCE=""
-ARG DS_BUILD_STOCHASTIC_TRANSFORMER=""
-ARG DS_BUILD_UTILS=""
-ARG DS_BUILD_AIO=""
+ARG DS_BUILD_TRANSFORMER="1"
+ARG DS_BUILD_TRANSFORMER_INFERENCE="1"
+ARG DS_BUILD_STOCHASTIC_TRANSFORMER="1"
+ARG DS_BUILD_UTILS="1"
+ARG DS_BUILD_AIO="1"
 
 ARG DEEPSPEED_VERSION
 
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
-RUN python3 -m pip install -U --no-cache-dir \
-      setuptools wheel pip deepspeed-kernels && \
+RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    python3 -m pip install -U --no-cache-dir \
+      setuptools wheel pip py-cpuinfo && \
     if python3 -m pip show torch | grep 'Version: 2\.[1-9]' > /dev/null; then \
       # DeepSpeed's AIO extension is incompatible with PyTorch 2.1.x's
       # requirement for C++17 (as of DeepSpeed 0.10.1).
@@ -144,9 +184,9 @@ RUN python3 -m pip install -U --no-cache-dir \
       do if [[ -z ${!VAR} ]]; then unset ${VAR}; fi; done; \
     } && \
     CC=$(realpath -e ./compiler) \
-      MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)" \
+      MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)}" \
       python3 -m pip wheel -w /wheels \
-      --no-cache-dir --no-build-isolation --no-deps \
+      --no-cache-dir --no-build-isolation --no-deps -v \
       deepspeed==${DEEPSPEED_VERSION} && \
     rm ./*
 SHELL ["/bin/sh", "-c"]
@@ -164,6 +204,7 @@ RUN LIBNCCL2_VERSION=$(dpkg-query --showformat='${Version}' --show libnccl2) &&
 # --distributed_adam, --distributed_lamb, and --group_norm aren't documented
 # in the Apex README, but are defined in its setup.py config.
 RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
     python3 -m pip install -U --no-cache-dir \
       packaging setuptools wheel pip && \
     CUDA_MAJOR_VERSION=$(echo "${CUDA_VERSION}" | cut -d. -f1) && \
@@ -178,8 +219,7 @@ RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \
       :; \
     )" && \
     export CC=$(realpath -e ./compiler) && \
-    export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 24)" && \
-    export NVCC_APPEND_FLAGS='-diag-suppress 186,177' && \
+    export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 24)}" && \
     printf -- '--config-settings="--build-option=%s" ' $( \
       echo \
         --cpp_ext \
@@ -218,12 +258,12 @@ FROM builder-base as xformers-builder
 ARG XFORMERS_VERSION
 
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
-RUN python3 -m pip install -U --no-cache-dir \
+RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    python3 -m pip install -U --no-cache-dir \
       setuptools wheel pip && \
     CC=$(realpath -e ./compiler) \
       MAX_JOBS=1 \
       PYTHONUNBUFFERED=1 \
-      NVCC_APPEND_FLAGS='-diag-suppress 186,177' \
       XFORMERS_DISABLE_FLASH_ATTN=1 \
       python3 -m pip wheel -w /wheels -v \
       --no-cache-dir --no-build-isolation --no-deps \
diff --git a/torch-extras/compiler_wrapper.f95 b/torch-extras/compiler_wrapper.f95
index f8c13bd2..cbdc602e 100644
--- a/torch-extras/compiler_wrapper.f95
+++ b/torch-extras/compiler_wrapper.f95
@@ -1,13 +1,25 @@
+#ifndef WRAPPER_NATIVE
+#define WRAPPER_NATIVE "skylake"
+#endif
+
+#ifndef WRAPPER_CC
+#define WRAPPER_CC "gcc"
+#endif
+
+#ifndef WRAPPER_AVX
+#define WRAPPER_AVX "AVX256"
+#endif
+
 PROGRAM compiler_wrapper
-    ! Wraps GCC invocations,
-    ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions
-    ! with -D__AVX256__, and -march=native with -march=skylake,
+    ! Wraps C compiler invocations,
+    ! replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions
+    ! with -D__<WRAPPER_AVX>__, and -march=native with -march=<WRAPPER_NATIVE>,
     ! for better reproducibility and compatibility.
     IMPLICIT NONE
     INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0
     CHARACTER(len=:), ALLOCATABLE :: arg, command
     ALLOCATE(CHARACTER(len=128) :: arg)
-    command = "gcc"
+    command = WRAPPER_CC
 
     DO i = 1, COMMAND_ARGUMENT_COUNT()
         DO
@@ -22,9 +34,15 @@ PROGRAM compiler_wrapper
             END IF
         END DO
         IF (arg == "-march=native") THEN
-            command = command // " '-march=skylake'"
-        ELSE IF (arg == "-D__AVX512__" .OR. arg == "-D__SCALAR__") THEN
-            command = command // " '-D__AVX256__'"
+            command = command // (" '-march=" // WRAPPER_NATIVE // "'")
+        ELSE IF ( &
+            arg == "-D__AVX512__" &
+            .OR. arg == "-D__AVX256__" &
+            .OR. arg == "-D__SCALAR__" &
+        ) THEN
+#ifndef WRAPPER_NO_AVX
+            command = command // (" '-D__" // WRAPPER_AVX // "__'")
+#endif
         ELSE
             command = command // shell_escaped(arg)
         END IF
diff --git a/torch/Dockerfile b/torch/Dockerfile
index 6705427d..e070232a 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -1,15 +1,17 @@
-# syntax=docker/dockerfile:1.4
-ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.4.1-devel-ubuntu22.04"
-ARG FINAL_BASE_IMAGE="nvidia/cuda:12.4.1-base-ubuntu22.04"
+# syntax=docker/dockerfile:1.7
+ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.8.0-devel-ubuntu22.04"
+ARG FINAL_BASE_IMAGE="nvidia/cuda:12.8.0-base-ubuntu22.04"
 
-ARG BUILD_TORCH_VERSION="2.5.0"
+ARG BUILD_TORCH_VERSION="2.5.1"
 ARG BUILD_TORCH_VISION_VERSION="0.20.0"
 ARG BUILD_TORCH_AUDIO_VERSION="2.5.0"
-ARG BUILD_TRANSFORMERENGINE_VERSION="1.11"
-ARG BUILD_FLASH_ATTN_VERSION="2.6.3"
+ARG BUILD_TRANSFORMERENGINE_VERSION="1.13"
+ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1"
+ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1"
 ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TRITON="1"
-ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX"
+ARG BUILD_TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0 10.0+PTX"
+ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90;100"
 
 # 8.7 is supported in the PyTorch main branch, but not 2.0.0
 
@@ -19,42 +21,47 @@ ARG AOCL_URL="https://download.amd.com/developer/eula/aocl/aocl-4-2/aocl-linux-a
 
 # Clone PyTorch repositories independently from all other build steps
 # for cache-friendliness and parallelization
-FROM alpine/git:2.40.1 as downloader-base
+FROM alpine/git:2.40.1 AS downloader-base
 WORKDIR /git
 RUN git config --global advice.detachedHead false
 
 COPY <<-"EOT" /git/clone.sh
-    #!/bin/sh
-    REPO="https://github.com/$1";
-    DEST="$2";
-    REF="$3";
-
-    CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; };
-
-    # Try cloning REF as a tag prefixed with "v", otherwise fall back
-    # to git checkout for commit hashes
-    CLONE --recurse-submodules --shallow-submodules --also-filter-submodules --no-tags \
-      "$REPO" -b "v$REF" "$DEST" || { \
-        CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \
-        git -C "$DEST" checkout "$REF" && \
-        git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \
-    };
+	#!/bin/sh
+	REPO="https://github.com/$1";
+	DEST="$2";
+	REF="$3";
+
+	CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; };
+
+	# Try cloning REF as a tag prefixed with "v", otherwise fall back
+	# to git checkout for commit hashes
+	CLONE --recurse-submodules --shallow-submodules --also-filter-submodules --no-tags \
+	  "$REPO" -b "v$REF" "$DEST" || { \
+	    CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \
+	    git -C "$DEST" checkout "$REF" && \
+	    git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \
+	};
 EOT
 
 RUN chmod 755 /git/clone.sh
 
 
-FROM downloader-base as pytorch-downloader
+FROM downloader-base AS pytorch-downloader
 ARG BUILD_TORCH_VERSION
+# Includes a patch for a foreach bug in PyTorch v2.5.1
 RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \
+    if [ "${BUILD_TORCH_VERSION}" = '2.5.1' ]; then \
+      wget 'https://github.com/pytorch/pytorch/commit/1cdaf1d85f5e4b3f8952fd0737a1afeb16995d13.patch' -qO- \
+      | git -C pytorch apply; \
+    fi && \
     rm -rf pytorch/.git
 
-FROM downloader-base as torchvision-downloader
+FROM downloader-base AS torchvision-downloader
 ARG BUILD_TORCH_VISION_VERSION
 RUN ./clone.sh pytorch/vision vision "${BUILD_TORCH_VISION_VERSION}" && \
     rm -rf vision/.git
 
-FROM downloader-base as torchaudio-downloader
+FROM downloader-base AS torchaudio-downloader
 ARG BUILD_TORCH_AUDIO_VERSION
 RUN ./clone.sh pytorch/audio audio "${BUILD_TORCH_AUDIO_VERSION}"
 # The torchaudio build requires that this directory remain a full git repository,
@@ -70,22 +77,23 @@ RUN if grep -qF '#include <float.h>' \
     fi && \
     rm /git/patch
 
-FROM downloader-base as transformerengine-downloader
+FROM downloader-base AS transformerengine-downloader
 ARG BUILD_TRANSFORMERENGINE_VERSION
 RUN ./clone.sh NVIDIA/TransformerEngine TransformerEngine "${BUILD_TRANSFORMERENGINE_VERSION}"
 
-# Include a patch commit that is sort-of part of v1.11 but isn't in their v1.11 release git tag
-# See https://github.com/NVIDIA/TransformerEngine/pull/1222
-RUN if [ "${BUILD_TRANSFORMERENGINE_VERSION}" = '1.11' ]; then \
-      wget 'https://github.com/NVIDIA/TransformerEngine/commit/fc034785f5e3a5bc5600a88766d9a1d75137ce77.patch' -qO- \
-      | git -C TransformerEngine apply -v --stat --apply -; \
-    fi
-
-FROM downloader-base as flash-attn-downloader
+FROM downloader-base AS flash-attn-downloader
 ARG BUILD_FLASH_ATTN_VERSION
 RUN ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_VERSION}"
 
-FROM downloader-base as triton-version
+FROM downloader-base AS flash-attn-3-downloader
+ARG BUILD_FLASH_ATTN_3_VERSION
+RUN if [ -n "$BUILD_FLASH_ATTN_3_VERSION" ]; then \
+      ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_3_VERSION}"; \
+    else \
+      mkdir flash-attention; \
+    fi
+
+FROM downloader-base AS triton-version
 ENV TRITON_COMMIT_FILE='.ci/docker/ci_commit_pins/triton.txt'
 COPY --link --from=pytorch-downloader "/git/pytorch/${TRITON_COMMIT_FILE}" /git/version.txt
 ARG BUILD_TRITON_VERSION
@@ -93,7 +101,7 @@ RUN if [ -n "${BUILD_TRITON_VERSION}" ]; then \
       echo "${BUILD_TRITON_VERSION}" > /git/version.txt; \
     fi
 
-FROM downloader-base as triton-downloader
+FROM downloader-base AS triton-downloader
 COPY --link --from=triton-version /git/version.txt /git/version.txt
 ARG BUILD_TRITON
 RUN if [ "${BUILD_TRITON}" = '1' ]; then \
@@ -102,7 +110,7 @@ RUN if [ "${BUILD_TRITON}" = '1' ]; then \
       mkdir triton; \
     fi
 
-FROM alpine/curl:8.7.1 as aocl-downloader
+FROM alpine/curl:8.7.1 AS aocl-downloader
 WORKDIR /tmp/install
 
 RUN apk add --no-cache bash
@@ -128,7 +136,7 @@ RUN curl -sSfo- "${AOCL_URL}" | tar xzf - --strip-components 1 && \
 
 
 ## Build PyTorch on a builder image.
-FROM ${BUILDER_BASE_IMAGE} as builder-base
+FROM ${BUILDER_BASE_IMAGE} AS builder-base-shared
 ENV DEBIAN_FRONTEND=noninteractive
 
 ARG BUILD_CCACHE_SIZE="1Gi"
@@ -142,7 +150,7 @@ RUN apt-get -qq update && apt-get -qq install -y \
     /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \
     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
-    ln -s libomp.so.5 /usr/lib/x86_64-linux-gnu/libomp.so && \
+    ln -s libomp.so.5 "/usr/lib/$(gcc -print-multiarch)/libomp.so" && \
     ldconfig
 
 COPY --link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh
@@ -160,12 +168,24 @@ RUN export \
     rm /tmp/install_cudnn.sh && \
     apt-get clean
 
+# Add Kitware's apt repository to get a newer version of CMake
+RUN apt-get -qq update && apt-get -qq install -y \
+      software-properties-common lsb-release && \
+    { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+    | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \
+    apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
+    apt-get -qq update && apt-get -qq install -y cmake && apt-get clean
+
 RUN mkdir /tmp/ccache-install && \
     cd /tmp/ccache-install && \
-    CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2-linux-x86_64.tar.xz' && \
-    wget -qO - $CCACHE_URL | tar --strip-components 1 -xJf - && \
+    CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2.tar.xz' && \
+    wget -qO - "$CCACHE_URL" | tar --strip-components 1 -xJf - && \
+    mkdir build && \
+    cd build && \
+    cmake -B. -S.. -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build . --config Release && \
     make install && \
-    cd .. && \
+    cd ../.. && \
     rm -rf /tmp/ccache-install && \
     ccache -M "${BUILD_CCACHE_SIZE}" && \
     ccache -F 0
@@ -176,30 +196,35 @@ ENV CCACHE_DIR=/ccache \
     CMAKE_CXX_COMPILER_LAUNCHER=ccache \
     CMAKE_CUDA_COMPILER_LAUNCHER=ccache
 
-# Add Kitware's apt repository to get a newer version of CMake
-RUN apt-get -qq update && apt-get -qq install -y \
-      software-properties-common lsb-release && \
-    { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \
-    | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \
-    apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
-    apt-get -qq update && apt-get -qq install -y cmake && apt-get clean
-
 # Update compiler (GCC) and linker (LLD) versions
-RUN CODENAME="$(lsb_release -cs)" && \
+RUN LLVM_VERSION='18' && \
+    CODENAME="$(lsb_release -cs)" && \
     wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
-    apt-add-repository -n "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \
+    apt-add-repository -n "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \
     SETUP_TOOLCHAIN() { \
         apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \
         | sed -e '/connection timed out/{p; Q1}' && \
-        apt-get -qq install --no-install-recommends -y gcc-11 g++-11 gfortran-11 lld-17 && \
+        apt-get -qq install --no-install-recommends -y \
+          gcc-11 g++-11 gfortran-11 \
+          "lld-$LLVM_VERSION" "libomp-$LLVM_VERSION-dev" && \
         apt-get clean; \
     } && \
     { SETUP_TOOLCHAIN || { sleep "$(shuf -i10-20 -n1)" && SETUP_TOOLCHAIN; }; } && \
     update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \
     update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
     update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \
-    update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1
+    if [ "$(uname -m)" != 'aarch64' ]; then \
+      update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1; \
+    fi && \
+    ldconfig
+
+
+FROM builder-base-shared AS builder-base-arm64
+# There is currently no CPU BLAS used for ARM builds,
+# so this stage is just an alias
+
 
+FROM builder-base-shared AS builder-base-amd64
 # Install AOCL-BLAS and AOCL-LAPACK
 # See: https://www.amd.com/en/developer/aocl/dense.html
 ARG AOCL_BASE
@@ -231,39 +256,50 @@ ENV C_INCLUDE_PATH="${AOCL_BASE}/include${C_INCLUDE_PATH:+:$C_INCLUDE_PATH}" \
     LD_LIBRARY_PATH="${AOCL_BASE}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
     LIBRARY_PATH="${AOCL_BASE}/lib${LIBRARY_PATH:+:$LIBRARY_PATH}"
 
+
+FROM builder-base-${TARGETARCH} AS builder-base
 RUN mkdir /build /build/dist
 WORKDIR /build
 COPY --chmod=755 effective_cpu_count.sh .
 COPY --chmod=755 scale.sh .
 COPY compiler_wrapper.f95 .
-RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
+ARG AMD64_NATIVE_ARCH="skylake"
+ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres"
+RUN if [ "$(uname -m)" = "aarch64" ]; then \
+      NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \
+      AVX='WRAPPER_NO_AVX'; \
+    else \
+      NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \
+      AVX='WRAPPER_AVX="AVX256"'; \
+    fi && \
+    gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95
 
 COPY <<-"EOT" /build/version-string.sh
-    #!/bin/sh
-    set -e;
-    VERSION="$1";
-
-    IS_HASH() {
-      echo "$1" | grep -qxiEe '[0-9a-f]{40}';
-    };
-
-    if IS_HASH "$VERSION"; then
-      REAL_VERSION="$(cat ./version.txt)";
-      SHORT_HASH="$(echo "$VERSION" | cut -c1-7)";
-      echo "$REAL_VERSION+$SHORT_HASH";
-    else
-      echo "$VERSION";
-    fi;
+	#!/bin/sh
+	set -e;
+	VERSION="$1";
+
+	IS_HASH() {
+	  echo "$1" | grep -qxiEe '[0-9a-f]{40}';
+	};
+
+	if IS_HASH "$VERSION"; then
+	  REAL_VERSION="$(cat ./version.txt)";
+	  SHORT_HASH="$(echo "$VERSION" | cut -c1-7)";
+	  echo "$REAL_VERSION+$SHORT_HASH";
+	else
+	  echo "$VERSION";
+	fi;
 EOT
 RUN chmod 755 /build/version-string.sh
 
 COPY <<-"EOT" /build/storage-info.sh
-    #!/bin/sh
-    set -e;
-    TARGET="$(realpath "$1")";
+	#!/bin/sh
+	set -e;
+	TARGET="$(realpath "$1")";
 
-    STORAGE_INFO="$(df -h '--output=fstype,used,avail,pcent,target' "$TARGET")" || exit 0;
-    printf 'Storage info for %s:\n%s\n' "$TARGET" "$STORAGE_INFO";
+	STORAGE_INFO="$(df -h '--output=fstype,used,avail,pcent,target' "$TARGET")" || exit 0;
+	printf 'Storage info for %s:\n%s\n' "$TARGET" "$STORAGE_INFO";
 EOT
 RUN chmod 755 /build/storage-info.sh
 
@@ -280,10 +316,12 @@ ENV CMAKE_PREFIX_PATH=/usr/bin/ \
     CUDNN_LIB_DIR=/usr/local/cuda/lib64
 
 ARG BUILD_TRITON
+ARG BUILD_MAX_JOBS=""
 RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \
     --mount=type=cache,target=/ccache \
     if [ "$BUILD_TRITON" = '1' ]; then \
-      export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \
+      pip3 install --no-cache-dir pybind11 && \
+      export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
       cd triton/python && \
       python3 -m pip wheel -w ../../dist/ --no-build-isolation --no-deps -vv . && \
       pip3 install ../../dist/*.whl; \
@@ -292,7 +330,19 @@ RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,r
 ARG BUILD_TORCH_VERSION
 ARG BUILD_TORCH_CUDA_ARCH_LIST
 ENV TORCH_VERSION=$BUILD_TORCH_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
+# Filter out the 10.0 arch on CUDA versions != 12.8
+ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${BUILD_TORCH_CUDA_ARCH_LIST/ 10.0/}||${BUILD_TORCH_CUDA_ARCH_LIST}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}"
+
+ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a"
+# Add sm_100a build if NV_CUDA_LIB_VERSION matches 12.[89].*
+RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \
+    case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \
+      FLAGS="${FLAGS} -gencode=arch=compute_100a,code=sm_100a" ;; \
+    esac && \
+    echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf
 
 # If the directory /opt/nccl-tests exists,
 # the base image is assumed to be nccl-tests,
@@ -320,16 +370,26 @@ ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
 # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271).
 # Without WITH_BLAS, it would detect the BLAS implementation as
 # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either.
+ARG BUILD_CXX11_ABI=""
+SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
 RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \
     --mount=type=cache,target=/ccache \
-    export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \
+    export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
     echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
+    if [ -n "${BUILD_CXX11_ABI}" ]; then \
+      export _GLIBCXX_USE_CXX11_ABI="${BUILD_CXX11_ABI}"; \
+    fi && \
     ./storage-info.sh . && \
     cd pytorch && \
     ../storage-info.sh . && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
     ln -s /usr/bin/c++ build/c++ && \
+    if [ "$(uname -m)" = 'aarch64' ]; then \
+      export USE_PRIORITIZED_TEXT_FOR_LD=1; \
+    fi && \
     { if [ -d /opt/nccl-tests ]; then \
       export \
         USE_DISTRIBUTED=1 \
@@ -350,16 +410,16 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     WITH_BLAS=FLAME \
     PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \
     PYTORCH_BUILD_NUMBER=0 \
-    TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 191,186,177" \
-    python3 setup.py bdist_wheel --dist-dir ../dist
+    TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
+    python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \
+    | grep -Ev --line-buffered '^(ptxas /tmp/|copying .+/|creating build/)'
+SHELL ["/bin/sh", "-c"]
 RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl
 
-ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177"
-
 RUN python3 -m pip install -U --no-cache-dir \
       packaging setuptools wheel pip
 
-FROM builder-base as torchvision-builder
+FROM builder-base AS torchvision-builder
 RUN rm ./dist/*
 
 ## Build torchvision
@@ -370,7 +430,10 @@ RUN pip3 install --no-cache-dir --upgrade \
 
 RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=vision/,rw \
     --mount=type=cache,target=/ccache \
-    export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \
+    export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
+    echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
     cd vision && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
@@ -396,7 +459,7 @@ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=visi
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     python3 setup.py bdist_wheel --dist-dir ../dist
 
-FROM builder-base as torchaudio-builder
+FROM builder-base AS torchaudio-builder
 RUN rm ./dist/*
 
 ## Build torchaudio
@@ -407,7 +470,10 @@ RUN pip3 install --no-cache-dir --upgrade \
 
 RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/,rw \
     --mount=type=cache,target=/ccache \
-    export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \
+    export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \
+    echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
     cd audio && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
@@ -433,12 +499,23 @@ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     python3 setup.py bdist_wheel --dist-dir ../dist
 
-FROM builder-base as transformerengine-builder
+FROM builder-base AS transformerengine-builder
 RUN rm ./dist/*
 
 # Build TransformerEngine
+ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST
+ENV NVTE_CUDA_ARCHS=$BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST
+
 RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \
+    --mount=type=cache,target=/ccache \
     export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \
+    export MAX_JOBS="${BUILD_MAX_JOBS:-$MAX_JOBS}" && \
+    echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
+    case "${CUDA_VERSION}" in 12.[0123456].*) \
+      export NVTE_CUDA_ARCHS="${NVTE_CUDA_ARCHS%;100*}" ;; \
+    esac && \
     cd TransformerEngine && \
     if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \
       sed -i "s/from functools import cache/from functools import lru_cache as cache/g" \
@@ -446,40 +523,67 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE
     fi && \
     python3 setup.py bdist_wheel --dist-dir /build/dist
 
-FROM builder-base as flash-attn-builder
+FROM builder-base AS flash-attn-builder-base
 RUN rm ./dist/*
+ENV PYTHONUNBUFFERED=1
+ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
+ARG BUILD_FLASH_ATTN_MAX_JOBS=""
+
+COPY <<-"EOT" /build/fa-build.sh
+	#!/bin/bash
+	set -eo pipefail;
+	if [ -n "$1" ]; then cd "$1"; fi;
+	python3 setup.py bdist_wheel --dist-dir /build/dist \
+	| grep -Ev --line-buffered '^ptxas (/tmp/|(info|warning)\s*:)|bytes spill stores'
+EOT
+RUN chmod 755 /build/fa-build.sh
+
+FROM flash-attn-builder-base AS flash-attn-builder
 
-SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+# Build flash-attn
 RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \
+    --mount=type=cache,target=/ccache \
     export CC=$(realpath -e ./compiler) \
-      MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)" \
-      PYTHONUNBUFFERED=1 \
-      FLASH_ATTENTION_FORCE_BUILD='TRUE' && \
+      MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)}" && \
+    echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
     cd flash-attention && \
-    ( \
-      for EXT_DIR in $(realpath -s -e \
-        . \
-        csrc/ft_attention \
-        csrc/fused_dense_lib \
-        csrc/fused_softmax \
-        csrc/layer_norm \
-        csrc/rotary \
-        csrc/xentropy); \
-      do \
-          cd $EXT_DIR && \
-          python3 setup.py bdist_wheel --dist-dir /build/dist && \
-          cd - || \
-          exit 1; \
-      done; \
-    ) | \
-    grep -Ev --line-buffered 'ptxas info\s*:|bytes spill stores'
-SHELL ["/bin/sh", "-c"]
+    for EXT_DIR in $(realpath -s -e \
+      . \
+      csrc/ft_attention \
+      csrc/fused_dense_lib \
+      csrc/fused_softmax \
+      csrc/layer_norm \
+      csrc/rotary \
+      csrc/xentropy); \
+    do /build/fa-build.sh "$EXT_DIR" || exit 1; done
+
+FROM flash-attn-builder-base AS flash-attn-3-builder
+
+# Artificially sequence this build stage after the previous one
+# to prevent parallelism, because these are both very resource-intensive
+RUN --mount=type=bind,from=flash-attn-builder,source=/build,target=/build :
+
+# Build flash-attn v3
+RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,target=flash-attention/,rw \
+    --mount=type=cache,target=/ccache \
+    if [ ! -d flash-attention/hopper ]; then \
+      echo "Not compiling flash-attn v3" && exit 0; \
+    fi && \
+    export CC=$(realpath -e ./compiler) \
+      MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 10 6)}" && \
+    echo "MAX_JOBS: ${MAX_JOBS}" && \
+    export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \
+    echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \
+    /build/fa-build.sh flash-attention/hopper
 
-FROM builder-base as builder
+FROM builder-base AS builder
 COPY --link --from=torchaudio-builder /build/dist/ /build/dist/
 COPY --link --from=torchvision-builder /build/dist/ /build/dist/
 COPY --link --from=transformerengine-builder /build/dist/ /build/dist/
 COPY --link --from=flash-attn-builder /build/dist/ /build/dist/
+COPY --link --from=flash-attn-3-builder /build/dist/ /build/dist/
 
 ## Build the final torch image.
 FROM ${FINAL_BASE_IMAGE}
@@ -496,11 +600,11 @@ RUN apt-get -qq update && apt-get -qq install -y \
     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
     update-alternatives --install /usr/bin/vim vim /usr/bin/vim.tiny 1 && \
-    ln -s libomp.so.5 /usr/lib/x86_64-linux-gnu/libomp.so && \
+    ln -s libomp.so.5 "/usr/lib/$(gcc -print-multiarch)/libomp.so" && \
     ldconfig
 
 RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \
-        software-properties-common && \
+        software-properties-common lsb-release && \
     SETUP_LIBSTDCXX() { \
         apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \
         | sed -e '/connection timed out/{p; Q1}' && \
@@ -509,6 +613,13 @@ RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \
     } && \
     { SETUP_LIBSTDCXX || { sleep "$(shuf -i10-20 -n1)" && SETUP_LIBSTDCXX; }; }
 
+RUN LLVM_VERSION='18' && \
+    CODENAME="$(lsb_release -cs)" && \
+    wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
+    apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \
+    apt-get -qq install -y --no-install-recommends "libomp5-$LLVM_VERSION" && \
+    apt-get clean
+
 # Install AOCL-BLAS and AOCL-LAPACK
 # See: https://www.amd.com/en/developer/aocl/dense.html
 ARG AOCL_BASE
@@ -525,7 +636,11 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST
 ENV TORCH_VERSION=$BUILD_TORCH_VERSION
 ENV TORCH_VISION_VERSION=$BUILD_TORCH_VISION_VERSION
 ENV TORCH_AUDIO_VERSION=$BUILD_TORCH_AUDIO_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
+# Filter out the 10.0 arch on CUDA versions != 12.8
+ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${BUILD_TORCH_CUDA_ARCH_LIST/ 10.0/}||${BUILD_TORCH_CUDA_ARCH_LIST}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}"
+ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}"
 
 COPY --link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh
 # - libnvjitlink-X-Y only exists for CUDA versions >= 12-0.
@@ -564,3 +679,51 @@ WORKDIR /usr/src/app
 RUN --mount=type=bind,from=builder,source=/build/dist,target=. \
     pip3 install --no-cache-dir -U numpy packaging && \
     pip3 install --no-cache-dir -U ./*.whl
+
+# Make a symlink to flash-attn v3 where TransformerEngine expects it,
+# and modify the installation record so that pip uninstall knows how to
+# fully remove it.
+RUN <<-"EOT" python3
+	#!/bin/env python3
+	from base64 import urlsafe_b64encode as b64
+	from hashlib import sha256
+	from importlib import metadata
+	from pathlib import Path
+	from py_compile import compile
+
+	dist = metadata.distribution("flashattn-hopper")
+	p = dist.locate_file("flash_attn_interface.py")
+	print("flash_attn_interface:", p)
+	root = p.parent
+
+	if not p.exists():
+	    raise SystemExit("flash_attn_interface not found")
+	if not p.is_file():
+	    raise SystemExit("flash_attn_interface path is not a file")
+
+	d = root / "flashattn_hopper"
+	if d.exists():
+	    raise SystemExit(f'"{d}" already exists')
+
+	d.mkdir(mode=0o755, parents=False, exist_ok=False)
+	new = d / p.name
+	new.symlink_to(p)
+	print(f"Created new symlink at {new}")
+
+	compiled = Path(compile(new))
+
+
+	def record_entry(path: Path) -> str:
+	    content = path.read_bytes()
+	    digest = b64(sha256(content).digest()).rstrip(b"=").decode()
+	    package_path = path.relative_to(root).as_posix()
+	    return f"{package_path},sha256={digest},{len(content):d}\r\n"
+
+
+	for f in dist.files:
+	    if f.match("flashattn?hopper-*.dist-info/RECORD"):
+	        with f.locate().open("a", encoding="utf-8", newline="") as record:
+	            for added in (new, compiled):
+	                record.write(record_entry(added))
+	        break
+EOT
diff --git a/torch/compiler_wrapper.f95 b/torch/compiler_wrapper.f95
index f8c13bd2..cbdc602e 100644
--- a/torch/compiler_wrapper.f95
+++ b/torch/compiler_wrapper.f95
@@ -1,13 +1,25 @@
+#ifndef WRAPPER_NATIVE
+#define WRAPPER_NATIVE "skylake"
+#endif
+
+#ifndef WRAPPER_CC
+#define WRAPPER_CC "gcc"
+#endif
+
+#ifndef WRAPPER_AVX
+#define WRAPPER_AVX "AVX256"
+#endif
+
 PROGRAM compiler_wrapper
-    ! Wraps GCC invocations,
-    ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions
-    ! with -D__AVX256__, and -march=native with -march=skylake,
+    ! Wraps C compiler invocations,
+    ! replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions
+    ! with -D__<WRAPPER_AVX>__, and -march=native with -march=<WRAPPER_NATIVE>,
     ! for better reproducibility and compatibility.
     IMPLICIT NONE
     INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0
     CHARACTER(len=:), ALLOCATABLE :: arg, command
     ALLOCATE(CHARACTER(len=128) :: arg)
-    command = "gcc"
+    command = WRAPPER_CC
 
     DO i = 1, COMMAND_ARGUMENT_COUNT()
         DO
@@ -22,9 +34,15 @@ PROGRAM compiler_wrapper
             END IF
         END DO
         IF (arg == "-march=native") THEN
-            command = command // " '-march=skylake'"
-        ELSE IF (arg == "-D__AVX512__" .OR. arg == "-D__SCALAR__") THEN
-            command = command // " '-D__AVX256__'"
+            command = command // (" '-march=" // WRAPPER_NATIVE // "'")
+        ELSE IF ( &
+            arg == "-D__AVX512__" &
+            .OR. arg == "-D__AVX256__" &
+            .OR. arg == "-D__SCALAR__" &
+        ) THEN
+#ifndef WRAPPER_NO_AVX
+            command = command // (" '-D__" // WRAPPER_AVX // "__'")
+#endif
         ELSE
             command = command // shell_escaped(arg)
         END IF