coreweave · wbrown · Feb 22, 2025 · Nov 4, 2024 · Nov 4, 2024 · Nov 5, 2024
diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml
@@ -1,6 +1,7 @@
-cuda: [ 12.6.1, 12.4.1, 12.2.2 ]
-os: [ ubuntu22.04, ubuntu20.04 ]
+cuda: [ '12.8.0', '12.6.3', '12.4.1' ]  # CUDA versions; quoted so they can never be read as numbers
+os: [ 'ubuntu22.04' ]  # base OS release
+abi: [ 1, 0 ]  # CXX11 ABI setting (1 = enable, 0 = disable)
 include:
-  - torch: 2.5.0
-    vision: 0.20.0
-    audio: 2.5.0
+  - torch: '2.6.0'
+    vision: '0.21.0'
+    audio: '2.6.0'
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
@@ -1,37 +1,9 @@
-image:
-  # Ubuntu 22.04
-  - cuda: 12.6.1
-    cudnn: cudnn
-    os: ubuntu22.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
-  - cuda: 12.4.1
-    cudnn: cudnn
-    os: ubuntu22.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
-  - cuda: 12.2.2
-    cudnn: cudnn8
-    os: ubuntu22.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
-  # Ubuntu 20.04
-  - cuda: 12.6.1
-    cudnn: cudnn
-    os: ubuntu20.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
-  - cuda: 12.4.1
-    cudnn: cudnn
-    os: ubuntu20.04
-    nccl: 2.23.4-1
-    nccl-tests-hash: 2ff05b2
-  - cuda: 12.2.2
-    cudnn: cudnn8
-    os: ubuntu20.04
-    nccl: 2.21.5-1
-    nccl-tests-hash: 2ff05b2
+cuda: [ '12.8.0', '12.6.3', '12.4.1' ]  # CUDA versions; quoted so they can never be read as numbers
+os: [ 'ubuntu22.04' ]  # base OS release
+abi: [ 1, 0 ]  # CXX11 ABI setting (1 = enable, 0 = disable)
 include:
-  - torch: 2.5.0
-    vision: 0.20.0
-    audio: 2.5.0
+  - torch: '2.6.0'
+    vision: '0.21.0'
+    audio: '2.6.0'
+    nccl: '2.25.1-1'  # NCCL version segment of the nccl-tests base image tag
+    nccl-tests-hash: '57fa979'  # trailing segment of the nccl-tests base image tag
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -19,6 +19,11 @@ on:
         required: false
         description: "Optional sub-key to append to the image name for build layer caching"
         type: string
+      platforms:  # comma-separated targets, forwarded to the build step's `platforms`
+        required: false
+        description: "Platforms for which to build (default: linux/amd64,linux/arm64)"
+        type: string
+        default: linux/amd64,linux/arm64
     outputs:
       outcome:
         description: "The outcome of the build"
@@ -33,26 +38,42 @@ on:
 jobs:
   build:
     name: Build Images
-    runs-on: [ self-hosted, Linux ]
+    runs-on: [ cw ]  # runner label
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0'  # job steps run in this image
+    timeout-minutes: 960  # 16 hours
+    defaults:
+      run:
+        shell: bash  # `run:` steps use bash-only syntax such as ${GITHUB_REPOSITORY,,}
     outputs:
       outcome: ${{ steps.docker-build.outcome }}
       tags: ${{ steps.meta.outputs.tags }}
       version: ${{ steps.meta.outputs.version }}
     steps:
-      - uses: actions/checkout@v3
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2.2.1
-      - name: Login to GitHub container registry
-        uses: docker/login-action@v2.2.0
+      - uses: actions/checkout@v4
+      - name: Fetch BuildKit Client Certs
+        uses: dopplerhq/secrets-fetch-action@v1.2.0
+        id: client-certs  # referenced below as steps.client-certs.outputs.*
         with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Login to DockerHub container registry
-        uses: docker/login-action@v2.2.0
+          doppler-token: ${{ secrets.ORG_BUILDKIT_CLIENT_TOKEN }}
+          doppler-project: ${{ secrets.BUILDKIT_CONSUMER_DOPPLER_PROJECT }}
+          doppler-config: prod
+          inject-env-vars: false  # the certs are consumed through step outputs below
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3.7.1
         with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+          driver: remote  # connect to already-running BuildKit endpoints
+          endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }}
+          platforms: linux/amd64  # node 0 builds amd64
+          append: |  # node 1: a second endpoint that builds arm64
+            - endpoint: ${{ secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }}
+              platforms: linux/arm64
+        env:  # TLS client credentials per builder node; both nodes use the same cert set
+          BUILDER_NODE_0_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
+          BUILDER_NODE_0_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
+          BUILDER_NODE_0_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
+          BUILDER_NODE_1_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }}
+          BUILDER_NODE_1_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }}
+          BUILDER_NODE_1_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }}
       - name: Get base registry
         run: |
           echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV
@@ -70,14 +91,21 @@ jobs:
           echo "CACHE_KEY=${{ inputs.image-name }}-${{ inputs.cache-key }}" >> $GITHUB_ENV
       - name: Extract metadata (tags, labels) for Docker
         id: meta
-        uses: docker/metadata-action@v4.1.1
+        uses: docker/metadata-action@v5.5.1
         with:
           images: ${{ env.REGISTRY }}/${{ inputs.image-name }}
           tags: |
             type=sha,prefix=${{ env.TAG_PREFIX }},suffix=${{ env.TAG_SUFFIX }},format=short
+      - name: Initialize registry credentials file
+        env:  # read inside jq as env.USER / env.PASS, so the token never appears in the script text
+          USER: ${{ github.actor }}
+          PASS: ${{ secrets.GITHUB_TOKEN }}
+        run: |  # -D creates ~/.docker if it is missing; mode 400 leaves the file owner-read-only
+          jq -n '.auths."ghcr.io" = { username: env.USER, password: env.PASS }' \
+          | install -D -m400 /dev/stdin ~/.docker/config.json
       - name: Build and push Docker image
         id: docker-build
-        uses: docker/build-push-action@v3.2.0
+        uses: docker/build-push-action@v6.9.0
         with:
           context: ${{ inputs.folder }}
           build-args: |-
@@ -87,6 +115,11 @@ jobs:
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }}
           cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max
+          platforms: ${{ inputs.platforms }}  # comma-separated list from the workflow input
+      - name: Clear registry credentials
+        if: always()  # also runs after a failed or cancelled build
+        run: |  # the trailing test fails the step if the file still exists after rm
+          rm -f ~/.docker/config.json && [ ! -e ~/.docker/config.json ]
       - uses: 8BitJonny/gh-get-current-pr@2.1.3
         id: PR
         with:

diff --git a/.github/workflows/read-configuration.yml b/.github/workflows/read-configuration.yml
@@ -17,12 +17,16 @@ on:
 jobs:
   read-file:
     name: Read Configuration File
-    runs-on: ["self-hosted", "Linux"]
+    runs-on: [ cw ]  # runner label
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'  # NOTE(review): build.yml pins v1.9.0 — confirm v1.4.0 is intended
+    defaults:
+      run:
+        shell: bash  # default shell for every `run:` step in this job
     permissions: {}
     outputs:
       config: ${{ steps.read.outputs.contents }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Read configuration
         id: read
         env:

diff --git a/.github/workflows/sglang.yml b/.github/workflows/sglang.yml
@@ -0,0 +1,30 @@
+on:
+  workflow_dispatch:  # manual trigger
+    inputs:
+      tag:  # passed to build.yml as tag-suffix
+        description: 'Tag for the build'
+        required: true
+      base-image:  # passed to the build as the BASE_IMAGE build arg
+        description: 'Base image from which to build'
+        required: true
+      builder-image:
+        description: 'Image to use to compile wheels, if different from the base image'
+        required: false
+  push:  # rebuild when the sglang sources or either workflow file changes
+    paths:
+      - "sglang/**"
+      - ".github/workflows/sglang.yml"
+      - ".github/workflows/build.yml"
+
+
+jobs:
+  build:  # builds the sglang image through the shared build workflow
+    uses: ./.github/workflows/build.yml
+    secrets: inherit
+    with:
+      image-name: sglang
+      folder: sglang  # build context directory
+      tag-suffix: ${{ inputs.tag || '386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1' }}  # default applies when no tag input is given (e.g. push runs)
+      build-args: |  # BASE_IMAGE falls back to the pinned default; BUILDER_IMAGE is emitted only when supplied, otherwise the Dockerfile defaults it to BASE_IMAGE
+        BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:es-actions-386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1'}}
+        ${{ inputs.builder-image && format('BUILDER_IMAGE={0}', inputs.builder-image) || '' }}
diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml
@@ -35,11 +35,12 @@ jobs:
     secrets: inherit
     with:
       image-name: ${{ inputs.image-name }}
-      tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
+      tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }}  # NOTE(review): a custom image-tag-suffix replaces the -abi<N> part, so both ABI builds would share one tag — confirm
       builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-${{ matrix.os }}
       base-image: nvidia/cuda:${{ matrix.cuda }}-base-${{ matrix.os }}
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
+      cxx11-abi: ${{ matrix.abi }}  # 1 = enable, 0 = disable the CXX11 ABI
       cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }}
       build-extras: true
diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml
@@ -51,13 +51,17 @@ jobs:
   get-required-bases:
     name: Get Latest Required Base Images
     if: inputs.skip-bases-check != true
-    runs-on: ["self-hosted", "Linux"]
+    runs-on: [ cw ]  # runner label
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0'  # NOTE(review): build.yml pins v1.9.0 — confirm v1.4.0 is intended
+    defaults:
+      run:
+        shell: bash  # default shell for every `run:` step in this job
     permissions:
       packages: read
     outputs:
       bases-list: ${{ steps.choose-bases.outputs.list }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
       - name: Check if torch-extras needs to be rebuilt from previous bases

diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml
@@ -43,11 +43,12 @@ jobs:
     secrets: inherit
     with:
       image-name: ${{ inputs.image-name }}
-      tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.image.cuda, matrix.image.os, matrix.image.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
-      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
-      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }}  # NOTE(review): a custom image-tag-suffix replaces the -abi<N> part, so both ABI builds would share one tag — confirm
+      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}  # same nccl-tests image is used for building and as the final base
+      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
       torch-version: ${{ matrix.torch }}
       torchvision-version: ${{ matrix.vision }}
       torchaudio-version: ${{ matrix.audio }}
-      cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }}
+      cxx11-abi: ${{ matrix.abi }}  # 1 = enable, 0 = disable the CXX11 ABI
+      cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}  # NOTE(review): no abi component, so both ABI variants use the same key — confirm intended
       build-extras: true
diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
@@ -19,7 +19,11 @@ jobs:
   get-nightly-info:
     name:
       Get Nightly Info
-    runs-on: [ self-hosted, Linux ]
+    runs-on: [ cw ]  # runner label
+    container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0'  # job steps run in this image
+    defaults:
+      run:
+        shell: bash  # default shell for every `run:` step in this job
     outputs:
       pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }}
       triton-commit: ${{ steps.get-hash.outputs.triton-commit }}
@@ -89,13 +93,13 @@ jobs:
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-base.yml
-      filter: del(.include)
+      filter: 'del(.include) | .exclude |= . + [{"os": "ubuntu20.04"}]'  # drops include, keeps the other keys (cuda/os/abi), and appends an ubuntu20.04 exclusion
   get-nccl-config:
     name: Get torch:nccl Config
     uses: ./.github/workflows/read-configuration.yml
     with:
       path: ./.github/configurations/torch-nccl.yml
-      filter: del(.include)
+      filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"os": "ubuntu20.04"}]'  # strips torch/vision/audio from each include entry but keeps its other keys (nccl, nccl-tests-hash)
 
   build-base:
     name: Build Nightly torch:base
@@ -130,12 +134,12 @@ jobs:
     secrets: inherit
     with:
       image-name: nightly-torch
-      tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.os, matrix.image.nccl, needs.get-nightly-info.outputs.version-string ) }}
-      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
-      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.cuda, matrix.os, matrix.nccl, needs.get-nightly-info.outputs.version-string ) }}  # NOTE(review): no abi component here, yet the config filter keeps the abi axis — confirm the matrix does not yield two jobs with the same tag
+      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}  # same nccl-tests image is used for building and as the final base
+      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }}
       torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
       torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
       torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
       triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
-      cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }}
+      cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }}  # keyed on CUDA version and OS only
       build-extras: true
diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml
@@ -22,10 +22,9 @@ on:
       triton-version:
         required: false
         type: string
-      cuda-arch-support:
+      cxx11-abi:  # 1 = enable, 0 = disable; when empty, no BUILD_CXX11_ABI build arg is emitted
         required: false
         type: string
-        default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
       image-name:
         required: false
         type: string
@@ -67,11 +66,10 @@ on:
         required: false
         description: "Tagged version number from openai/triton to build"
         type: string
-      cuda-arch-support:
+      cxx11-abi:  # when empty, no BUILD_CXX11_ABI build arg is emitted
         required: false
-        description: "Space-separated list of CUDA architectures to support"
+        description: "Build with the CXX11 ABI (1 = enable, 0 = disable)"
         type: string
-        default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
       image-name:
         required: false
         description: "Custom name under which to publish the resulting container"
@@ -99,7 +97,8 @@ jobs:
         BUILD_TORCH_VERSION=${{ inputs.torch-version }}
         BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }}
         BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }}
-        ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
+        BUILD_TORCH_CUDA_ARCH_LIST=7.0 7.5 8.0 8.6 8.9 9.0+PTX
+        ${{ inputs.cxx11-abi && format('BUILD_CXX11_ABI={0}', inputs.cxx11-abi) || '' }}
         ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }}
   build-extras:
     name: Build torch-extras

diff --git a/sglang/Dockerfile b/sglang/Dockerfile
@@ -0,0 +1,28 @@
+# syntax=docker/dockerfile:1.2
+ARG BASE_IMAGE
+ARG BUILDER_IMAGE="${BASE_IMAGE}"
+# Builder stage: runs build.bash, then ships /wheels (plus install.bash) to the final stage.
+FROM ${BUILDER_IMAGE} AS builder
+# CUDA architectures handed to build.bash through its -a option.
+ARG BUILD_TORCH_CUDA_ARCH_LIST='8.0 8.6 8.9 9.0 10.0+PTX'
+# Pinned source commits (presumably read by build.bash from the environment; confirm).
+ARG FLASHINFER_COMMIT='c04755e21f4d6fb7813c703f2b00a7ef012be9b8'
+ARG CUTLASS_COMMIT='b78588d1630aa6643bf021613717bafb705df4ef'
+ARG VLLM_COMMIT='5095e966069b9e65b7c4c63427e06cebacaad0a0'
+ARG SGLANG_COMMIT='4b6f62e2bc52a528551e9a21e7b0a4945c6115bb'
+ARG DECORD_COMMIT='d2e56190286ae394032a8141885f76d5372bd44b'
+# Building Triton is not currently enabled,
+# but this is the commit that would be used if it were
+ARG TRITON_COMMIT='1e0e51c4aeb3e1beea000da5d0e494f8b9ac40dd'
+
+WORKDIR /build
+COPY build.bash /build/
+RUN mkdir /wheels && \
+    bash build.bash -a "${BUILD_TORCH_CUDA_ARCH_LIST}" && \
+    rm -rf /build/*
+COPY install.bash /wheels/
+# Final stage: /wheels is bind-mounted from the builder for this RUN only, then install.bash runs.
+FROM ${BASE_IMAGE}
+RUN --mount=type=bind,from=builder,source=/wheels,target=/wheels \
+    cd /wheels && \
+    bash install.bash