From 9a5990d38e2e1286c33ef7688e68a16af93c4905 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 4 Nov 2024 14:48:38 -0600 Subject: [PATCH 01/94] feat(torch): Update PyTorch to v2.5.1 & update CUDA 12.6 --- .github/configurations/torch-base.yml | 4 ++-- .github/configurations/torch-nccl.yml | 2 +- torch/Dockerfile | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml index 761f2b6b..4c94e304 100644 --- a/.github/configurations/torch-base.yml +++ b/.github/configurations/torch-base.yml @@ -1,6 +1,6 @@ -cuda: [ 12.6.1, 12.4.1, 12.2.2 ] +cuda: [ 12.6.2, 12.4.1, 12.2.2 ] os: [ ubuntu22.04, ubuntu20.04 ] include: - - torch: 2.5.0 + - torch: 2.5.1 vision: 0.20.0 audio: 2.5.0 diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 6bd5f029..84c743ed 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -32,6 +32,6 @@ image: nccl: 2.21.5-1 nccl-tests-hash: 2ff05b2 include: - - torch: 2.5.0 + - torch: 2.5.1 vision: 0.20.0 audio: 2.5.0 diff --git a/torch/Dockerfile b/torch/Dockerfile index 6705427d..24f7f1f2 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -2,7 +2,7 @@ ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.4.1-devel-ubuntu22.04" ARG FINAL_BASE_IMAGE="nvidia/cuda:12.4.1-base-ubuntu22.04" -ARG BUILD_TORCH_VERSION="2.5.0" +ARG BUILD_TORCH_VERSION="2.5.1" ARG BUILD_TORCH_VISION_VERSION="0.20.0" ARG BUILD_TORCH_AUDIO_VERSION="2.5.0" ARG BUILD_TRANSFORMERENGINE_VERSION="1.11" From eec6daabd4e7899216875b9cb53ee341697bafc5 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 4 Nov 2024 16:11:33 -0600 Subject: [PATCH 02/94] feat(torch): Update `torch:nccl` base images --- .github/configurations/torch-nccl.yml | 25 +++++-------------------- .github/workflows/torch-nccl.yml | 4 ++-- .github/workflows/torch-nightly.yml | 4 ++-- 3 files changed, 9 insertions(+), 24 deletions(-) diff --git 
a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 84c743ed..a34899b7 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,37 +1,22 @@ image: # Ubuntu 22.04 - - cuda: 12.6.1 - cudnn: cudnn + - cuda: 12.6.2 os: ubuntu22.04 - nccl: 2.23.4-1 - nccl-tests-hash: 2ff05b2 - cuda: 12.4.1 - cudnn: cudnn os: ubuntu22.04 - nccl: 2.23.4-1 - nccl-tests-hash: 2ff05b2 - cuda: 12.2.2 - cudnn: cudnn8 os: ubuntu22.04 - nccl: 2.23.4-1 - nccl-tests-hash: 2ff05b2 # Ubuntu 20.04 - - cuda: 12.6.1 - cudnn: cudnn + - cuda: 12.6.2 os: ubuntu20.04 - nccl: 2.23.4-1 - nccl-tests-hash: 2ff05b2 - cuda: 12.4.1 - cudnn: cudnn os: ubuntu20.04 - nccl: 2.23.4-1 - nccl-tests-hash: 2ff05b2 - cuda: 12.2.2 - cudnn: cudnn8 os: ubuntu20.04 - nccl: 2.21.5-1 - nccl-tests-hash: 2ff05b2 include: - torch: 2.5.1 vision: 0.20.0 audio: 2.5.0 + image: + nccl: 2.23.4-1 + nccl-tests-hash: c58f522 diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml index aee13052..0c78bc55 100644 --- a/.github/workflows/torch-nccl.yml +++ b/.github/workflows/torch-nccl.yml @@ -44,8 +44,8 @@ jobs: with: image-name: ${{ inputs.image-name }} tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.image.cuda, matrix.image.os, matrix.image.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }} - builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} - base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + base-image: 
ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} torch-version: ${{ matrix.torch }} torchvision-version: ${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 063e40af..d12e09de 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -131,8 +131,8 @@ jobs: with: image-name: nightly-torch tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.os, matrix.image.nccl, needs.get-nightly-info.outputs.version-string ) }} - builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} - base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} From e411a3d961f4f4487cb6793620d8db8c459d6f8e Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 5 Nov 2024 13:44:40 -0600 Subject: [PATCH 03/94] ci(torch): Change `torch:nccl` matrix build layout --- .github/configurations/torch-nccl.yml | 22 ++++------------------ .github/workflows/torch-nccl.yml | 8 ++++---- 
.github/workflows/torch-nightly.yml | 8 ++++---- 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index a34899b7..74da04cc 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,22 +1,8 @@ -image: - # Ubuntu 22.04 - - cuda: 12.6.2 - os: ubuntu22.04 - - cuda: 12.4.1 - os: ubuntu22.04 - - cuda: 12.2.2 - os: ubuntu22.04 - # Ubuntu 20.04 - - cuda: 12.6.2 - os: ubuntu20.04 - - cuda: 12.4.1 - os: ubuntu20.04 - - cuda: 12.2.2 - os: ubuntu20.04 +cuda: [ 12.6.2, 12.4.1, 12.2.2 ] +os: [ ubuntu22.04, ubuntu20.04 ] include: - torch: 2.5.1 vision: 0.20.0 audio: 2.5.0 - image: - nccl: 2.23.4-1 - nccl-tests-hash: c58f522 + nccl: 2.23.4-1 + nccl-tests-hash: c58f522 diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml index 0c78bc55..9815639d 100644 --- a/.github/workflows/torch-nccl.yml +++ b/.github/workflows/torch-nccl.yml @@ -43,11 +43,11 @@ jobs: secrets: inherit with: image-name: ${{ inputs.image-name }} - tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.image.cuda, matrix.image.os, matrix.image.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }} - builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} - base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }} + builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} + 
base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} torch-version: ${{ matrix.torch }} torchvision-version: ${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} - cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }} + cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index d12e09de..bdf4a1b8 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -130,12 +130,12 @@ jobs: secrets: inherit with: image-name: nightly-torch - tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.os, matrix.image.nccl, needs.get-nightly-info.outputs.version-string ) }} - builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} - base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.cuda, matrix.os, matrix.nccl, needs.get-nightly-info.outputs.version-string ) }} + builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} + base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }} - cache-key: nccl-cuda${{ 
matrix.image.cuda }}-${{ matrix.image.os }} + cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true From 86120387a3dcaff6439ac0b808137a286a3efd8b Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 6 Nov 2024 16:11:59 -0600 Subject: [PATCH 04/94] ci: Update action versions --- .github/workflows/build.yml | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cbb09fdc..7301747e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -39,20 +39,15 @@ jobs: tags: ${{ steps.meta.outputs.tags }} version: ${{ steps.meta.outputs.version }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2.2.1 + uses: docker/setup-buildx-action@v3.7.1 - name: Login to GitHub container registry uses: docker/login-action@v2.2.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Login to DockerHub container registry - uses: docker/login-action@v2.2.0 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Get base registry run: | echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV @@ -70,14 +65,14 @@ jobs: echo "CACHE_KEY=${{ inputs.image-name }}-${{ inputs.cache-key }}" >> $GITHUB_ENV - name: Extract metadata (tags, labels) for Docker id: meta - uses: docker/metadata-action@v4.1.1 + uses: docker/metadata-action@v5.5.1 with: images: ${{ env.REGISTRY }}/${{ inputs.image-name }} tags: | type=sha,prefix=${{ env.TAG_PREFIX }},suffix=${{ env.TAG_SUFFIX }},format=short - name: Build and push Docker image id: docker-build - uses: docker/build-push-action@v3.2.0 + uses: docker/build-push-action@v6.9.0 with: context: ${{ inputs.folder }} build-args: |- @@ -87,6 +82,7 @@ jobs: labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ 
env.CACHE_KEY || inputs.image-name }} cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max + platforms: linux/amd64 - uses: 8BitJonny/gh-get-current-pr@2.1.3 id: PR with: From f58c927d7907931cf00d7a618d4a54e9f36cd4e8 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 6 Nov 2024 16:31:59 -0600 Subject: [PATCH 05/94] ci: Use remote BuildKit worker & new runners --- .github/workflows/build.yml | 38 +++++++++++++++++++----- .github/workflows/read-configuration.yml | 8 +++-- .github/workflows/torch-extras.yml | 8 +++-- .github/workflows/torch-nightly.yml | 6 +++- 4 files changed, 48 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7301747e..065233e3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,21 +33,34 @@ on: jobs: build: name: Build Images - runs-on: [ self-hosted, Linux ] + runs-on: [ cw ] + container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0' + defaults: + run: + shell: bash outputs: outcome: ${{ steps.docker-build.outcome }} tags: ${{ steps.meta.outputs.tags }} version: ${{ steps.meta.outputs.version }} steps: - uses: actions/checkout@v4 + - name: Fetch BuildKit Client Certs + uses: dopplerhq/secrets-fetch-action@v1.2.0 + id: client-certs + with: + doppler-token: ${{ secrets.ORG_BUILDKIT_CLIENT_TOKEN }} + doppler-project: ${{ secrets.BUILDKIT_CONSUMER_DOPPLER_PROJECT }} + doppler-config: prod + inject-env-vars: false - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3.7.1 - - name: Login to GitHub container registry - uses: docker/login-action@v2.2.0 with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} + driver: remote + endpoint: ${{ secrets.BUILDKIT_CONSUMER_ENDPOINT }} + env: + BUILDER_NODE_0_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }} + BUILDER_NODE_0_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }} + 
BUILDER_NODE_0_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }} - name: Get base registry run: | echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV @@ -70,6 +83,13 @@ jobs: images: ${{ env.REGISTRY }}/${{ inputs.image-name }} tags: | type=sha,prefix=${{ env.TAG_PREFIX }},suffix=${{ env.TAG_SUFFIX }},format=short + - name: Initialize registry credentials file + env: + USER: ${{ github.actor }} + PASS: ${{ secrets.GITHUB_TOKEN }} + run: | + jq -n '.auths."ghcr.io" = { username: env.USER, password: env.PASS }' \ + | install -m400 /dev/stdin ~/.docker/config.json - name: Build and push Docker image id: docker-build uses: docker/build-push-action@v6.9.0 @@ -77,12 +97,16 @@ jobs: context: ${{ inputs.folder }} build-args: |- ${{ inputs.build-args }} - push: true + push: false tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }} cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max platforms: linux/amd64 + - name: Clear registry credentials + if: always() + run: | + rm -f ~/.docker/config.json && [ ! 
-e ~/.docker/config.json ] - uses: 8BitJonny/gh-get-current-pr@2.1.3 id: PR with: diff --git a/.github/workflows/read-configuration.yml b/.github/workflows/read-configuration.yml index 12a21b31..25f5de3e 100644 --- a/.github/workflows/read-configuration.yml +++ b/.github/workflows/read-configuration.yml @@ -17,12 +17,16 @@ on: jobs: read-file: name: Read Configuration File - runs-on: ["self-hosted", "Linux"] + runs-on: [ cw ] + container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0' + defaults: + run: + shell: bash permissions: {} outputs: config: ${{ steps.read.outputs.contents }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Read configuration id: read env: diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index ca7134ed..e37a6c18 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -51,13 +51,17 @@ jobs: get-required-bases: name: Get Latest Required Base Images if: inputs.skip-bases-check != true - runs-on: ["self-hosted", "Linux"] + runs-on: [ cw ] + container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0' + defaults: + run: + shell: bash permissions: packages: read outputs: bases-list: ${{ steps.choose-bases.outputs.list }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Check if torch-extras needs to be rebuilt from previous bases diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index bdf4a1b8..4c3a4b30 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -19,7 +19,11 @@ jobs: get-nightly-info: name: Get Nightly Info - runs-on: [ self-hosted, Linux ] + runs-on: [ cw ] + container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0' + defaults: + run: + shell: bash outputs: pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }} triton-commit: ${{ 
steps.get-hash.outputs.triton-commit }} From a3f1d783c80a6cec0964744e9ae162ce7a1fea2b Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 6 Nov 2024 17:00:19 -0600 Subject: [PATCH 06/94] ci(torch-nightly): Only filter specific fields from configs' `include` --- .github/workflows/torch-nightly.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 4c3a4b30..5841531c 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -93,13 +93,13 @@ jobs: uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-base.yml - filter: del(.include) + filter: '.include = ( .include | del(.torch, .vision, .audio) )' get-nccl-config: name: Get torch:nccl Config uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-nccl.yml - filter: del(.include) + filter: '.include = ( .include | del(.torch, .vision, .audio) )' build-base: name: Build Nightly torch:base From e6bb688b043f0079cb8a3db750c07d46cfd7216b Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 6 Nov 2024 17:07:55 -0600 Subject: [PATCH 07/94] ci(torch-nightly): Change `del()` syntax in `yq` filter --- .github/workflows/torch-nightly.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 5841531c..3ec7de14 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -93,13 +93,13 @@ jobs: uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-base.yml - filter: '.include = ( .include | del(.torch, .vision, .audio) )' + filter: 'del( .include | ( .torch, .vision, .audio ) )' get-nccl-config: name: Get torch:nccl Config uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-nccl.yml - filter: '.include = ( .include | del(.torch, 
.vision, .audio) )' + filter: 'del( .include | ( .torch, .vision, .audio ) )' build-base: name: Build Nightly torch:base From 4fe66e299e3e6107ef0a6ebeee93812232626a8d Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 6 Nov 2024 17:11:23 -0600 Subject: [PATCH 08/94] ci(torch-nightly): Treat `include` as an array in `yq` filter --- .github/workflows/torch-nightly.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 3ec7de14..25f8be2a 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -93,13 +93,13 @@ jobs: uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-base.yml - filter: 'del( .include | ( .torch, .vision, .audio ) )' + filter: 'del( .include[] | ( .torch, .vision, .audio ) )' get-nccl-config: name: Get torch:nccl Config uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-nccl.yml - filter: 'del( .include | ( .torch, .vision, .audio ) )' + filter: 'del( .include[] | ( .torch, .vision, .audio ) )' build-base: name: Build Nightly torch:base From 1c31940b9ef6027c315c8d4114bfb49a55908803 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 6 Nov 2024 17:44:13 -0600 Subject: [PATCH 09/94] ci(torch-nightly): Exclude `ubuntu20.04` from `torch-nightly` builds --- .github/workflows/torch-nightly.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 25f8be2a..8d700747 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -93,13 +93,13 @@ jobs: uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-base.yml - filter: 'del( .include[] | ( .torch, .vision, .audio ) )' + filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . 
+ [{"os": "ubuntu20.04"}]' get-nccl-config: name: Get torch:nccl Config uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-nccl.yml - filter: 'del( .include[] | ( .torch, .vision, .audio ) )' + filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"os": "ubuntu20.04"}]' build-base: name: Build Nightly torch:base From 092a837e17189302b03c46cd6e9d592d012764af Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 6 Nov 2024 17:49:37 -0600 Subject: [PATCH 10/94] ci(torch-nightly): Filter out entire `include` key for `torch:base` --- .github/workflows/torch-nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 8d700747..fb51869f 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -93,7 +93,7 @@ jobs: uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-base.yml - filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"os": "ubuntu20.04"}]' + filter: 'del(.include) | .exclude |= . 
+ [{"os": "ubuntu20.04"}]' get-nccl-config: name: Get torch:nccl Config uses: ./.github/workflows/read-configuration.yml From 3448307494403ff9915af7f78ce93fa1a40d276b Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 6 Nov 2024 23:03:21 -0600 Subject: [PATCH 11/94] ci: Build for multiple architectures [skip ci] --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 065233e3..3f770eff 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -97,12 +97,12 @@ jobs: context: ${{ inputs.folder }} build-args: |- ${{ inputs.build-args }} - push: false + push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }} cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max - platforms: linux/amd64 + platforms: linux/amd64,linux/arm64 - name: Clear registry credentials if: always() run: | From 3b6b17d32646bdd75bd29bb7dd67f9865c7fe538 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 6 Nov 2024 23:08:28 -0600 Subject: [PATCH 12/94] ci: Build only for `linux/amd64` [skip ci] --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3f770eff..79191ce7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -102,7 +102,7 @@ jobs: labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }} cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max - platforms: linux/amd64,linux/arm64 + platforms: linux/amd64 - name: Clear registry credentials if: always() run: | From 35c959b9675fbbb78543e3f7553e6c1a1e554b03 Mon Sep 17 
00:00:00 2001 From: Eta Date: Wed, 13 Nov 2024 13:35:01 -0600 Subject: [PATCH 13/94] fix(torch): Include a post-v2.5.1 bugfix patch when building PyTorch --- torch/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/Dockerfile b/torch/Dockerfile index 24f7f1f2..a5bb6f65 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -46,7 +46,12 @@ RUN chmod 755 /git/clone.sh FROM downloader-base as pytorch-downloader ARG BUILD_TORCH_VERSION +# Includes a patch for a foreach bug in PyTorch v2.5.1 RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \ + if [ "${BUILD_TORCH_VERSION}" = '2.5.1' ]; then \ + wget 'https://github.com/pytorch/pytorch/commit/1cdaf1d85f5e4b3f8952fd0737a1afeb16995d13.patch' -qO- \ + | git -C pytorch apply; \ + fi && \ rm -rf pytorch/.git FROM downloader-base as torchvision-downloader From cdafd6bc63de5c0c4914e9eb501db4dd137464cc Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 03:04:46 -0600 Subject: [PATCH 14/94] feat(torch): Parameterize `compiler_wrapper.f95` [skip ci] --- torch-extras/compiler_wrapper.f95 | 32 ++++++++++++++++++++++++------- torch/compiler_wrapper.f95 | 32 ++++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/torch-extras/compiler_wrapper.f95 b/torch-extras/compiler_wrapper.f95 index f8c13bd2..cbdc602e 100644 --- a/torch-extras/compiler_wrapper.f95 +++ b/torch-extras/compiler_wrapper.f95 @@ -1,13 +1,25 @@ +#ifndef WRAPPER_NATIVE +#define WRAPPER_NATIVE "skylake" +#endif + +#ifndef WRAPPER_CC +#define WRAPPER_CC "gcc" +#endif + +#ifndef WRAPPER_AVX +#define WRAPPER_AVX "AVX256" +#endif + PROGRAM compiler_wrapper - ! Wraps GCC invocations, - ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions - ! with -D__AVX256__, and -march=native with -march=skylake, + ! Wraps C compiler invocations, + ! replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions + ! with -D__<WRAPPER_AVX>__, and -march=native with -march=<WRAPPER_NATIVE>, ! 
for better reproducibility and compatibility. IMPLICIT NONE INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0 CHARACTER(len=:), ALLOCATABLE :: arg, command ALLOCATE(CHARACTER(len=128) :: arg) - command = "gcc" + command = WRAPPER_CC DO i = 1, COMMAND_ARGUMENT_COUNT() DO @@ -22,9 +34,15 @@ PROGRAM compiler_wrapper END IF END DO IF (arg == "-march=native") THEN - command = command // " '-march=skylake'" - ELSE IF (arg == "-D__AVX512__" .OR. arg == "-D__SCALAR__") THEN - command = command // " '-D__AVX256__'" + command = command // (" '-march=" // WRAPPER_NATIVE // "'") + ELSE IF ( & + arg == "-D__AVX512__" & + .OR. arg == "-D__AVX256__" & + .OR. arg == "-D__SCALAR__" & + ) THEN +#ifndef WRAPPER_NO_AVX + command = command // (" '-D__" // WRAPPER_AVX // "__'") +#endif ELSE command = command // shell_escaped(arg) END IF diff --git a/torch/compiler_wrapper.f95 b/torch/compiler_wrapper.f95 index f8c13bd2..cbdc602e 100644 --- a/torch/compiler_wrapper.f95 +++ b/torch/compiler_wrapper.f95 @@ -1,13 +1,25 @@ +#ifndef WRAPPER_NATIVE +#define WRAPPER_NATIVE "skylake" +#endif + +#ifndef WRAPPER_CC +#define WRAPPER_CC "gcc" +#endif + +#ifndef WRAPPER_AVX +#define WRAPPER_AVX "AVX256" +#endif + PROGRAM compiler_wrapper - ! Wraps GCC invocations, - ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions - ! with -D__AVX256__, and -march=native with -march=skylake, + ! Wraps C compiler invocations, + ! replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions + ! with -D__<WRAPPER_AVX>__, and -march=native with -march=<WRAPPER_NATIVE>, ! for better reproducibility and compatibility. 
IMPLICIT NONE INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0 CHARACTER(len=:), ALLOCATABLE :: arg, command ALLOCATE(CHARACTER(len=128) :: arg) - command = "gcc" + command = WRAPPER_CC DO i = 1, COMMAND_ARGUMENT_COUNT() DO @@ -22,9 +34,15 @@ PROGRAM compiler_wrapper END IF END DO IF (arg == "-march=native") THEN - command = command // " '-march=skylake'" - ELSE IF (arg == "-D__AVX512__" .OR. arg == "-D__SCALAR__") THEN - command = command // " '-D__AVX256__'" + command = command // (" '-march=" // WRAPPER_NATIVE // "'") + ELSE IF ( & + arg == "-D__AVX512__" & + .OR. arg == "-D__AVX256__" & + .OR. arg == "-D__SCALAR__" & + ) THEN +#ifndef WRAPPER_NO_AVX + command = command // (" '-D__" // WRAPPER_AVX // "__'") +#endif ELSE command = command // shell_escaped(arg) END IF From e0bd93a07629cb2c6e01627bfd38eb4b9e2c94aa Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 03:07:15 -0600 Subject: [PATCH 15/94] fix(torch): Enable preprocessor when compiling `compiler_wrapper.f95` [skip ci] --- torch-extras/Dockerfile | 2 +- torch/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 3c785f0f..b67aea1c 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -80,7 +80,7 @@ WORKDIR /build # The compiler wrapper normalizes -march=native to -march=skylake # along with a couple other transformations before invoking GCC. COPY compiler_wrapper.f95 . -RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 +RUN gfortran -cpp -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 COPY --chmod=755 effective_cpu_count.sh . COPY --chmod=755 scale.sh . diff --git a/torch/Dockerfile b/torch/Dockerfile index a5bb6f65..185ecee4 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -241,7 +241,7 @@ WORKDIR /build COPY --chmod=755 effective_cpu_count.sh . COPY --chmod=755 scale.sh . COPY compiler_wrapper.f95 . 
-RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 +RUN gfortran -cpp -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 COPY <<-"EOT" /build/version-string.sh #!/bin/sh From 074edd5d82ea0e672122ff4dd29bc1b6559bf452 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 03:34:54 -0600 Subject: [PATCH 16/94] build(torch): Make the build process less architecture-dependent [skip ci] --- torch/Dockerfile | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 185ecee4..dfb99727 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -147,7 +147,7 @@ RUN apt-get -qq update && apt-get -qq install -y \ /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ - ln -s libomp.so.5 /usr/lib/x86_64-linux-gnu/libomp.so && \ + ln -s libomp.so.5 "/usr/lib/$(gcc -print-multiarch)/libomp.so" && \ ldconfig COPY --link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh @@ -167,10 +167,14 @@ RUN export \ RUN mkdir /tmp/ccache-install && \ cd /tmp/ccache-install && \ - CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2-linux-x86_64.tar.xz' && \ - wget -qO - $CCACHE_URL | tar --strip-components 1 -xJf - && \ + CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2.tar.xz' && \ + wget -qO - "$CCACHE_URL" | tar --strip-components 1 -xJf - && \ + mkdir build && \ + cd build && \ + cmake -B. -S.. -DCMAKE_BUILD_TYPE=Release && \ + cmake --build . --config Release && \ make install && \ - cd .. && \ + cd ../.. 
&& \ rm -rf /tmp/ccache-install && \ ccache -M "${BUILD_CCACHE_SIZE}" && \ ccache -F 0 @@ -501,7 +505,7 @@ RUN apt-get -qq update && apt-get -qq install -y \ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ update-alternatives --install /usr/bin/vim vim /usr/bin/vim.tiny 1 && \ - ln -s libomp.so.5 /usr/lib/x86_64-linux-gnu/libomp.so && \ + ln -s libomp.so.5 "/usr/lib/$(gcc -print-multiarch)/libomp.so" && \ ldconfig RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \ From dc97cd6a45d2c4ee1a541d0f9346c72b0311b941 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 03:36:25 -0600 Subject: [PATCH 17/94] fix(torch): Use `cmake` after installing it instead of before [skip ci] --- torch/Dockerfile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index dfb99727..2b7c04df 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -165,6 +165,14 @@ RUN export \ rm /tmp/install_cudnn.sh && \ apt-get clean +# Add Kitware's apt repository to get a newer version of CMake +RUN apt-get -qq update && apt-get -qq install -y \ + software-properties-common lsb-release && \ + { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \ + | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \ + apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ + apt-get -qq update && apt-get -qq install -y cmake && apt-get clean + RUN mkdir /tmp/ccache-install && \ cd /tmp/ccache-install && \ CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2.tar.xz' && \ @@ -185,14 +193,6 @@ ENV CCACHE_DIR=/ccache \ CMAKE_CXX_COMPILER_LAUNCHER=ccache \ CMAKE_CUDA_COMPILER_LAUNCHER=ccache -# Add Kitware's apt repository to get a newer version of CMake -RUN apt-get -qq update && apt-get -qq install -y \ - software-properties-common lsb-release && 
\ - { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \ - | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \ - apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ - apt-get -qq update && apt-get -qq install -y cmake && apt-get clean - # Update compiler (GCC) and linker (LLD) versions RUN CODENAME="$(lsb_release -cs)" && \ wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ From 85f3b0ea38eae8a5463ff7f1880f872036e8d25e Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 03:44:42 -0600 Subject: [PATCH 18/94] build(torch): Allow customizing `-march` with `--build-arg`s [skip ci] --- torch/Dockerfile | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 2b7c04df..d8090574 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -245,7 +245,16 @@ WORKDIR /build COPY --chmod=755 effective_cpu_count.sh . COPY --chmod=755 scale.sh . COPY compiler_wrapper.f95 . 
-RUN gfortran -cpp -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 +ARG AMD64_NATIVE_ARCH="skylake" +ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres" +RUN if [ "$(uname -m)" = "aarch64" ]; then \ + NATIVE="${ARM64_NATIVE_ARCH}" && \ + AVX='WRAPPER_NO_AVX'; \ + else \ + NATIVE="${AMD64_NATIVE_ARCH}" && \ + AVX='WRAPPER_AVX=AVX256'; \ + fi && \ + gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 COPY <<-"EOT" /build/version-string.sh #!/bin/sh From 68121aa18427b4259b92ab2509545a92de1805c1 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 03:53:28 -0600 Subject: [PATCH 19/94] build(torch): Allow customizing `MAX_JOBS` as a build arg [skip ci] --- torch/Dockerfile | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index d8090574..c2767668 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -338,9 +338,10 @@ ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271). # Without WITH_BLAS, it would detect the BLAS implementation as # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either. +ARG BUILD_MAX_JOBS="" RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \ --mount=type=cache,target=/ccache \ - export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ ./storage-info.sh . 
&& \ cd pytorch && \ @@ -388,7 +389,8 @@ RUN pip3 install --no-cache-dir --upgrade \ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=vision/,rw \ --mount=type=cache,target=/ccache \ - export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ + echo "MAX_JOBS: ${MAX_JOBS}" && \ cd vision && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ @@ -425,7 +427,8 @@ RUN pip3 install --no-cache-dir --upgrade \ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/,rw \ --mount=type=cache,target=/ccache \ - export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ + echo "MAX_JOBS: ${MAX_JOBS}" && \ cd audio && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ @@ -457,6 +460,8 @@ RUN rm ./dist/* # Build TransformerEngine RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \ export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-MAX_JOBS}" && \ + echo "MAX_JOBS: ${MAX_JOBS}" && \ cd TransformerEngine && \ if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \ sed -i "s/from functools import cache/from functools import lru_cache as cache/g" \ @@ -470,9 +475,10 @@ RUN rm ./dist/* SHELL ["/bin/bash", "-o", "pipefail", "-c"] RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \ export CC=$(realpath -e ./compiler) \ - MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)" \ + MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)}" \ PYTHONUNBUFFERED=1 \ FLASH_ATTENTION_FORCE_BUILD='TRUE' && \ + echo "MAX_JOBS: ${MAX_JOBS}" && \ cd flash-attention && \ ( \ for EXT_DIR in $(realpath -s -e \ From 
d35022bf5045924a507234c8cb96504716ab18d6 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 04:07:33 -0600 Subject: [PATCH 20/94] build(torch): Don't apply custom `MAX_JOBS` to `flash-attn` build [skip ci] --- torch/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index c2767668..bdc2d747 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -370,7 +370,8 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \ PYTORCH_BUILD_NUMBER=0 \ TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 191,186,177" \ - python3 setup.py bdist_wheel --dist-dir ../dist + python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \ + | grep -v '^ptxas /tmp/' RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177" @@ -475,7 +476,7 @@ RUN rm ./dist/* SHELL ["/bin/bash", "-o", "pipefail", "-c"] RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \ export CC=$(realpath -e ./compiler) \ - MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)}" \ + MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)" \ PYTHONUNBUFFERED=1 \ FLASH_ATTENTION_FORCE_BUILD='TRUE' && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ From 8305f7fb306e6271f0302a0ab9f83300754c487b Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 04:10:20 -0600 Subject: [PATCH 21/94] build(torch): Line-buffer `grep` output when building PyTorch [skip ci] --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index bdc2d747..251673e9 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -371,7 +371,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch PYTORCH_BUILD_NUMBER=0 \ TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 
191,186,177" \ python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \ - | grep -v '^ptxas /tmp/' + | grep -v --line-buffered '^ptxas /tmp/' RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177" From daa5a4b9cc9ca1f23d9652c3c6573db0956595db Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 04:13:27 -0600 Subject: [PATCH 22/94] build(torch): Filter more output when building PyTorch [skip ci] --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 251673e9..5cf4792f 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -371,7 +371,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch PYTORCH_BUILD_NUMBER=0 \ TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 191,186,177" \ python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \ - | grep -v --line-buffered '^ptxas /tmp/' + | grep -Ev --line-buffered '^(ptxas /tmp/|copying torch/|creating build/)' RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177" From c940ad4c298bbb553dd27a021e74e2d59ee7ab3e Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 04:27:20 -0600 Subject: [PATCH 23/94] build(torch): Allow customizing TransformerEngine build arches --- torch/Dockerfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torch/Dockerfile b/torch/Dockerfile index 5cf4792f..857b5234 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -10,6 +10,7 @@ ARG BUILD_FLASH_ATTN_VERSION="2.6.3" ARG BUILD_TRITON_VERSION="" ARG BUILD_TRITON="1" ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX" +ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90" # 8.7 is supported in the PyTorch main branch, but not 2.0.0 @@ -459,6 +460,9 @@ FROM builder-base as transformerengine-builder RUN rm ./dist/* # Build TransformerEngine +ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST +ENV 
NVTE_CUDA_ARCHS=$BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST + RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \ export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \ export MAX_JOBS="${BUILD_MAX_JOBS:-MAX_JOBS}" && \ From 863fca5680066de747d9f8f0a706137006d48f14 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 04:28:49 -0600 Subject: [PATCH 24/94] feat(torch): Add `flash-attn` 3 beta [skip ci] --- torch/Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/torch/Dockerfile b/torch/Dockerfile index 857b5234..ca2648a2 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -7,6 +7,7 @@ ARG BUILD_TORCH_VISION_VERSION="0.20.0" ARG BUILD_TORCH_AUDIO_VERSION="2.5.0" ARG BUILD_TRANSFORMERENGINE_VERSION="1.11" ARG BUILD_FLASH_ATTN_VERSION="2.6.3" +ARG BUILD_FLASH_ATTN_3="1" ARG BUILD_TRITON_VERSION="" ARG BUILD_TRITON="1" ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX" @@ -477,6 +478,7 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE FROM builder-base as flash-attn-builder RUN rm ./dist/* +ARG BUILD_FLASH_ATTN_3 SHELL ["/bin/bash", "-o", "pipefail", "-c"] RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \ export CC=$(realpath -e ./compiler) \ @@ -485,9 +487,15 @@ RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar FLASH_ATTENTION_FORCE_BUILD='TRUE' && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ cd flash-attention && \ + if [ "$BUILD_FLASH_ATTN_3" = 1 ]; then \ + FA3_DIR="hopper"; \ + else \ + FA3_DIR=""; \ + fi && \ ( \ for EXT_DIR in $(realpath -s -e \ . 
\ + $FA3_DIR \ csrc/ft_attention \ csrc/fused_dense_lib \ csrc/fused_softmax \ From b4ce2dac58298bc73ca34bf1277848729c08e4a2 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 06:08:27 -0600 Subject: [PATCH 25/94] build(torch): Filter even more output when building PyTorch --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index ca2648a2..73cbb18a 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -373,7 +373,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch PYTORCH_BUILD_NUMBER=0 \ TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 191,186,177" \ python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \ - | grep -Ev --line-buffered '^(ptxas /tmp/|copying torch/|creating build/)' + | grep -Ev --line-buffered '^(ptxas /tmp/|copying .+/|creating build/)' RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177" From 87c22ff437c7731daae21e8344207f6b6ccc32a0 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 06:09:11 -0600 Subject: [PATCH 26/94] build(torch-extras): Configure `compiler_wrapper.f95` parameters --- torch-extras/Dockerfile | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index b67aea1c..456420f3 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -80,7 +80,16 @@ WORKDIR /build # The compiler wrapper normalizes -march=native to -march=skylake # along with a couple other transformations before invoking GCC. COPY compiler_wrapper.f95 . 
-RUN gfortran -cpp -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 +ARG AMD64_NATIVE_ARCH="skylake" +ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres" +RUN if [ "$(uname -m)" = "aarch64" ]; then \ + NATIVE="${ARM64_NATIVE_ARCH}" && \ + AVX='WRAPPER_NO_AVX'; \ + else \ + NATIVE="${AMD64_NATIVE_ARCH}" && \ + AVX='WRAPPER_AVX=AVX256'; \ + fi && \ + gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 COPY --chmod=755 effective_cpu_count.sh . COPY --chmod=755 scale.sh . From 9cb44f4b075cdf7f6d625cb30d5019749e9713fd Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 06:11:10 -0600 Subject: [PATCH 27/94] build(torch-extras): Allow overriding `MAX_JOBS` and `NVCC_APPEND_FLAGS` [skip ci] --- torch-extras/Dockerfile | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 456420f3..88b4029d 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -4,6 +4,7 @@ ARG BASE_IMAGE ARG DEEPSPEED_VERSION="0.14.4" ARG APEX_COMMIT="23c1f86520e22b505e8fdfcf6298273dff2d93d8" ARG XFORMERS_VERSION="0.0.28.post1" +ARG BUILD_MAX_JOBS="" FROM alpine/git:2.36.3 as apex-downloader WORKDIR /git @@ -94,6 +95,10 @@ RUN if [ "$(uname -m)" = "aarch64" ]; then \ COPY --chmod=755 effective_cpu_count.sh . COPY --chmod=755 scale.sh . 
+ARG BUILD_NVCC_APPEND_FLAGS="" +ENV NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS:$BUILD_NVCC_APPEND_FLAGS }-diag-suppress 186,177" +ARG BUILD_MAX_JOBS + FROM builder-base as deepspeed-builder # DeepSpeed build flags @@ -153,7 +158,7 @@ RUN python3 -m pip install -U --no-cache-dir \ do if [[ -z ${!VAR} ]]; then unset ${VAR}; fi; done; \ } && \ CC=$(realpath -e ./compiler) \ - MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)" \ + MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)}" \ python3 -m pip wheel -w /wheels \ --no-cache-dir --no-build-isolation --no-deps \ deepspeed==${DEEPSPEED_VERSION} && \ @@ -187,8 +192,7 @@ RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \ :; \ )" && \ export CC=$(realpath -e ./compiler) && \ - export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 24)" && \ - export NVCC_APPEND_FLAGS='-diag-suppress 186,177' && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 24)}" && \ printf -- '--config-settings="--build-option=%s" ' $( \ echo \ --cpp_ext \ @@ -232,7 +236,6 @@ RUN python3 -m pip install -U --no-cache-dir \ CC=$(realpath -e ./compiler) \ MAX_JOBS=1 \ PYTHONUNBUFFERED=1 \ - NVCC_APPEND_FLAGS='-diag-suppress 186,177' \ XFORMERS_DISABLE_FLASH_ATTN=1 \ python3 -m pip wheel -w /wheels -v \ --no-cache-dir --no-build-isolation --no-deps \ From 4a6f2a769ec1c82b044e3463a9d65cbe1b1ada95 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 06:13:18 -0600 Subject: [PATCH 28/94] fix(torch-extras): Add missing `+` in parameter expansion [skip ci] --- torch-extras/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 88b4029d..96866ddd 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -96,7 +96,7 @@ COPY --chmod=755 effective_cpu_count.sh . COPY --chmod=755 scale.sh . 
ARG BUILD_NVCC_APPEND_FLAGS="" -ENV NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS:$BUILD_NVCC_APPEND_FLAGS }-diag-suppress 186,177" +ENV NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS:+$BUILD_NVCC_APPEND_FLAGS }-diag-suppress 186,177" ARG BUILD_MAX_JOBS From 1dfcbc118cd2b51a46600114915daf553313e9ad Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 06:23:27 -0600 Subject: [PATCH 29/94] feat(torch-extras): Build DeepSpeed-Kernels [skip ci] --- torch-extras/Dockerfile | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 96866ddd..d7af221c 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -3,6 +3,7 @@ ARG BASE_IMAGE ARG DEEPSPEED_VERSION="0.14.4" ARG APEX_COMMIT="23c1f86520e22b505e8fdfcf6298273dff2d93d8" +ARG DEEPSPEED_KERNELS_COMMIT="e77acc40b104696d4e73229b787d1ef29a9685b1" ARG XFORMERS_VERSION="0.0.28.post1" ARG BUILD_MAX_JOBS="" @@ -17,6 +18,18 @@ RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \ --depth 1 --filter=blob:none && \ find -type d -name docs -prune -exec rm -r '{}' ';' + +FROM alpine/git:2.36.3 as ds-kernels-downloader +WORKDIR /git +ARG DEEPSPEED_KERNELS_COMMIT +RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \ + https://github.com/microsoft/DeepSpeed-Kernels ds-kernels && \ + cd ds-kernels && \ + git checkout "${DEEPSPEED_KERNELS_COMMIT}" && \ + git submodule update --init --recursive --jobs 8 \ + --depth 1 --filter=blob:none + + # Dependencies requiring NVCC are built ahead of time in a separate stage # so that the ~2 GiB dev library installations don't have to be included # in the final image. 
@@ -101,6 +114,15 @@ ARG BUILD_MAX_JOBS FROM builder-base as deepspeed-builder + +RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=ds-kernels/,rw \ + cd ds-kernels && \ + export CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" && \ + echo "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" && \ + python3 -m pip wheel -w /wheels \ + --no-cache-dir --no-build-isolation --no-deps . && \ + python3 -m pip install /wheels/*.whl + # DeepSpeed build flags # See: https://www.deepspeed.ai/tutorials/advanced-install ARG DS_BUILD_OPS="1" From 8a44533fa96c4713b70d726eddd0901c91b6f1c4 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 06:27:34 -0600 Subject: [PATCH 30/94] fix(torch-extras): Use separate build argument for DS-Kernels arches [skip ci] --- torch-extras/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index d7af221c..dfe0b05c 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -4,6 +4,7 @@ ARG BASE_IMAGE ARG DEEPSPEED_VERSION="0.14.4" ARG APEX_COMMIT="23c1f86520e22b505e8fdfcf6298273dff2d93d8" ARG DEEPSPEED_KERNELS_COMMIT="e77acc40b104696d4e73229b787d1ef29a9685b1" +ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST="80;86;89;90" ARG XFORMERS_VERSION="0.0.28.post1" ARG BUILD_MAX_JOBS="" @@ -115,9 +116,10 @@ ARG BUILD_MAX_JOBS FROM builder-base as deepspeed-builder +ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=ds-kernels/,rw \ cd ds-kernels && \ - export CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" && \ + export CUDA_ARCH_LIST="${DEEPSPEED_KERNELS_CUDA_ARCH_LIST}" && \ echo "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" && \ python3 -m pip wheel -w /wheels \ --no-cache-dir --no-build-isolation --no-deps . 
&& \ From a3e444d7f1f4ffb9dc6de12165c8013dd936fc09 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 06:34:14 -0600 Subject: [PATCH 31/94] build(torch-extras): Install `py-cpuinfo` before building DeepSpeed [skip ci] --- torch-extras/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index dfe0b05c..552b7888 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -151,7 +151,7 @@ ARG DEEPSPEED_VERSION SHELL ["/bin/bash", "-o", "pipefail", "-c"] RUN python3 -m pip install -U --no-cache-dir \ - setuptools wheel pip deepspeed-kernels && \ + setuptools wheel pip deepspeed-kernels py-cpuinfo && \ if python3 -m pip show torch | grep 'Version: 2\.[1-9]' > /dev/null; then \ # DeepSpeed's AIO extension is incompatible with PyTorch 2.1.x's # requirement for C++17 (as of DeepSpeed 0.10.1). From cc76549895d5c4b00fa6298d6f0904013e8d7374 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 13:17:07 -0600 Subject: [PATCH 32/94] build(torch): Allow setting `MAX_JOBS` when building Triton --- torch/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 73cbb18a..cd06d352 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -300,10 +300,11 @@ ENV CMAKE_PREFIX_PATH=/usr/bin/ \ CUDNN_LIB_DIR=/usr/local/cuda/lib64 ARG BUILD_TRITON +ARG BUILD_MAX_JOBS="" RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \ --mount=type=cache,target=/ccache \ if [ "$BUILD_TRITON" = '1' ]; then \ - export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ cd triton/python && \ python3 -m pip wheel -w ../../dist/ --no-build-isolation --no-deps -vv . 
&& \ pip3 install ../../dist/*.whl; \ @@ -340,7 +341,6 @@ ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271). # Without WITH_BLAS, it would detect the BLAS implementation as # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either. -ARG BUILD_MAX_JOBS="" RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \ --mount=type=cache,target=/ccache \ export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ From a444c4439c1c1098b33f7b513bcb9ff8ecc28441 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 13:18:56 -0600 Subject: [PATCH 33/94] build(torch): Allow configuring `NVCC_APPEND_FLAGS` as a build argument --- torch/Dockerfile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index cd06d352..a1dc3370 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -315,6 +315,9 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST ENV TORCH_VERSION=$BUILD_TORCH_VERSION ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST +ARG BUILD_NVCC_APPEND_FLAGS="" +ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}" + # If the directory /opt/nccl-tests exists, # the base image is assumed to be nccl-tests, # so it uses the system's special NCCL and UCC installations for the build. 
@@ -371,13 +374,11 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch WITH_BLAS=FLAME \ PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \ PYTORCH_BUILD_NUMBER=0 \ - TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 191,186,177" \ + TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \ | grep -Ev --line-buffered '^(ptxas /tmp/|copying .+/|creating build/)' RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl -ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177" - RUN python3 -m pip install -U --no-cache-dir \ packaging setuptools wheel pip From a8bbb8a655bd3fa91b60d007b71fddeecc12dadb Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 13:21:54 -0600 Subject: [PATCH 34/94] build(torch-extras): Don't install `cuda-nvprof` before building [skip ci] --- torch-extras/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 552b7888..46747c6a 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -47,7 +47,6 @@ RUN export \ libcublas-dev-${CUDA_PACKAGE_VERSION} \ libcusparse-dev-${CUDA_PACKAGE_VERSION} \ libcusolver-dev-${CUDA_PACKAGE_VERSION} \ - cuda-nvprof-${CUDA_PACKAGE_VERSION} \ cuda-profiler-api-${CUDA_PACKAGE_VERSION} \ cuda-nvtx-${CUDA_PACKAGE_VERSION} \ cuda-nvrtc-dev-${CUDA_PACKAGE_VERSION} && \ From de9a3de9b395d0904af43b43a27cb5ee967c9a3a Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 13:57:34 -0600 Subject: [PATCH 35/94] build(torch): Conditionally enable `USE_PRIORITIZED_TEXT_FOR_LD` [skip ci] --- torch/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch/Dockerfile b/torch/Dockerfile index a1dc3370..d82a23ea 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -354,6 +354,9 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch mkdir build && \ ln -s /usr/bin/cc build/cc && \ ln -s /usr/bin/c++ build/c++ && \ + if [ 
"$(uname -m)" = 'aarch64' ]; then \ + export USE_PRIORITIZED_TEXT_FOR_LD=1; \ + fi && \ { if [ -d /opt/nccl-tests ]; then \ export \ USE_DISTRIBUTED=1 \ From a889d618b229e6c1f8eae3d52ccd598658b0070d Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 14:21:20 -0600 Subject: [PATCH 36/94] build(torch): Don't use `lld` on `aarch64` Only standard ld is compatible with USE_PRIORITIZED_TEXT_FOR_LD [skip ci] --- torch/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index d82a23ea..87e0924b 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -209,7 +209,9 @@ RUN CODENAME="$(lsb_release -cs)" && \ update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \ update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \ update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \ - update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1 + if [ "$(uname -m)" != 'aarch64' ]; then \ + update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1; \ + fi # Install AOCL-BLAS and AOCL-LAPACK # See: https://www.amd.com/en/developer/aocl/dense.html From 86c27ff5e3c55ac7c56a3d6e271aaf25e3290562 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 18 Nov 2024 14:27:32 -0600 Subject: [PATCH 37/94] build(torch): Set `-eo pipefail` for the PyTorch build command [skip ci] --- torch/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/Dockerfile b/torch/Dockerfile index 87e0924b..2ee2e244 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -346,6 +346,7 @@ ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BU # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271). # Without WITH_BLAS, it would detect the BLAS implementation as # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either. 
+SHELL ["/bin/bash", "-eo", "pipefail", "-c"] RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \ --mount=type=cache,target=/ccache \ export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ @@ -382,6 +383,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \ | grep -Ev --line-buffered '^(ptxas /tmp/|copying .+/|creating build/)' +SHELL ["/bin/sh", "-c"] RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl RUN python3 -m pip install -U --no-cache-dir \ From 3b14ffb0cb5c8f0f8f0d9ad5fed5449d561223b2 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 19 Nov 2024 15:14:40 -0600 Subject: [PATCH 38/94] feat(torch): Update TransformerEngine to v1.12 --- torch/Dockerfile | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 2ee2e244..dea32e35 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -5,7 +5,7 @@ ARG FINAL_BASE_IMAGE="nvidia/cuda:12.4.1-base-ubuntu22.04" ARG BUILD_TORCH_VERSION="2.5.1" ARG BUILD_TORCH_VISION_VERSION="0.20.0" ARG BUILD_TORCH_AUDIO_VERSION="2.5.0" -ARG BUILD_TRANSFORMERENGINE_VERSION="1.11" +ARG BUILD_TRANSFORMERENGINE_VERSION="1.12" ARG BUILD_FLASH_ATTN_VERSION="2.6.3" ARG BUILD_FLASH_ATTN_3="1" ARG BUILD_TRITON_VERSION="" @@ -81,13 +81,6 @@ FROM downloader-base as transformerengine-downloader ARG BUILD_TRANSFORMERENGINE_VERSION RUN ./clone.sh NVIDIA/TransformerEngine TransformerEngine "${BUILD_TRANSFORMERENGINE_VERSION}" -# Include a patch commit that is sort-of part of v1.11 but isn't in their v1.11 release git tag -# See https://github.com/NVIDIA/TransformerEngine/pull/1222 -RUN if [ "${BUILD_TRANSFORMERENGINE_VERSION}" = '1.11' ]; then \ - wget 'https://github.com/NVIDIA/TransformerEngine/commit/fc034785f5e3a5bc5600a88766d9a1d75137ce77.patch' -qO- \ - | git -C TransformerEngine 
apply -v --stat --apply -; \ - fi - FROM downloader-base as flash-attn-downloader ARG BUILD_FLASH_ATTN_VERSION RUN ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_VERSION}" From bb07aedbc0de403f6765c420eee20b2738dcd4f1 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 19 Nov 2024 15:15:37 -0600 Subject: [PATCH 39/94] build(torch): Use `ccache` more often --- torch/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/Dockerfile b/torch/Dockerfile index dea32e35..19736455 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -466,6 +466,7 @@ ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST ENV NVTE_CUDA_ARCHS=$BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \ + --mount=type=cache,target=/ccache \ export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \ export MAX_JOBS="${BUILD_MAX_JOBS:-MAX_JOBS}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ @@ -482,6 +483,7 @@ RUN rm ./dist/* ARG BUILD_FLASH_ATTN_3 SHELL ["/bin/bash", "-o", "pipefail", "-c"] RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \ + --mount=type=cache,target=/ccache \ export CC=$(realpath -e ./compiler) \ MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)" \ PYTHONUNBUFFERED=1 \ From 65433acd72130f3a663f901a15eda26154ba3ae5 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 19 Nov 2024 15:16:49 -0600 Subject: [PATCH 40/94] feat(torch): Compile `flash-attn` 3 as a separate package [skip ci] --- torch/Dockerfile | 77 +++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 19736455..c960dba4 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -7,7 +7,7 @@ ARG BUILD_TORCH_VISION_VERSION="0.20.0" ARG BUILD_TORCH_AUDIO_VERSION="2.5.0" ARG BUILD_TRANSFORMERENGINE_VERSION="1.12" ARG BUILD_FLASH_ATTN_VERSION="2.6.3" -ARG 
BUILD_FLASH_ATTN_3="1" +ARG BUILD_FLASH_ATTN_3_VERSION="2.7.0.post2" ARG BUILD_TRITON_VERSION="" ARG BUILD_TRITON="1" ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX" @@ -85,6 +85,14 @@ FROM downloader-base as flash-attn-downloader ARG BUILD_FLASH_ATTN_VERSION RUN ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_VERSION}" +FROM downloader-base as flash-attn-3-downloader +ARG BUILD_FLASH_ATTN_3_VERSION +RUN if [ -n "$BUILD_FLASH_ATTN_3_VERSION" ]; then \ + ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_3_VERSION}"; \ + else \ + mkdir flash-attention; \ + fi + FROM downloader-base as triton-version ENV TRITON_COMMIT_FILE='.ci/docker/ci_commit_pins/triton.txt' COPY --link --from=pytorch-downloader "/git/pytorch/${TRITON_COMMIT_FILE}" /git/version.txt @@ -477,43 +485,52 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE fi && \ python3 setup.py bdist_wheel --dist-dir /build/dist -FROM builder-base as flash-attn-builder +FROM builder-base as flash-attn-builder-base RUN rm ./dist/* +ENV PYTHONUNBUFFERED=1 +ENV FLASH_ATTENTION_FORCE_BUILD=TRUE +ARG BUILD_FLASH_ATTN_MAX_JOBS="" + +COPY <<-"EOT" /build/fa-build.sh + #!/bin/bash + set -eo pipefail; + if [ -n "$1" ]; then cd "$1"; fi; + python3 setup.py bdist_wheel --dist-dir /build/dist \ + | grep -Ev --line-buffered '^ptxas /tmp/|ptxas info\s*:|bytes spill stores' +EOT +RUN chmod 755 /build/fa-build.sh + +FROM flash-attn-builder-base as flash-attn-builder -ARG BUILD_FLASH_ATTN_3 -SHELL ["/bin/bash", "-o", "pipefail", "-c"] +# Build flash-attn RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \ --mount=type=cache,target=/ccache \ export CC=$(realpath -e ./compiler) \ - MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)" \ - PYTHONUNBUFFERED=1 \ - FLASH_ATTENTION_FORCE_BUILD='TRUE' && \ + MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh 
"$(./effective_cpu_count.sh)" 8 12)}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ cd flash-attention && \ - if [ "$BUILD_FLASH_ATTN_3" = 1 ]; then \ - FA3_DIR="hopper"; \ - else \ - FA3_DIR=""; \ + for EXT_DIR in $(realpath -s -e \ + . \ + csrc/ft_attention \ + csrc/fused_dense_lib \ + csrc/fused_softmax \ + csrc/layer_norm \ + csrc/rotary \ + csrc/xentropy); \ + do /build/fa-build.sh; done + +FROM flash-attn-builder-base as flash-attn-3-builder + +# Build flash-attn v3 +RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,target=flash-attention/,rw \ + --mount=type=cache,target=/ccache \ + if [ ! -d flash-attention/hopper ]; then \ + echo "Not compiling flash-attn v3" && exit 0; \ fi && \ - ( \ - for EXT_DIR in $(realpath -s -e \ - . \ - $FA3_DIR \ - csrc/ft_attention \ - csrc/fused_dense_lib \ - csrc/fused_softmax \ - csrc/layer_norm \ - csrc/rotary \ - csrc/xentropy); \ - do \ - cd $EXT_DIR && \ - python3 setup.py bdist_wheel --dist-dir /build/dist && \ - cd - || \ - exit 1; \ - done; \ - ) | \ - grep -Ev --line-buffered 'ptxas info\s*:|bytes spill stores' -SHELL ["/bin/sh", "-c"] + export CC=$(realpath -e ./compiler) \ + MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 10 6)}" && \ + echo "MAX_JOBS: ${MAX_JOBS}" && \ + /build/fa-build.sh flash-attention/hopper FROM builder-base as builder COPY --link --from=torchaudio-builder /build/dist/ /build/dist/ From 69be6f1081df2572d676a336778b83d55bc542a8 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 19 Nov 2024 15:39:00 -0600 Subject: [PATCH 41/94] build(torch): Use tabs for heredoc indentation --- torch/Dockerfile | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index c960dba4..352a0ae3 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -31,16 +31,16 @@ COPY <<-"EOT" /git/clone.sh DEST="$2"; REF="$3"; - CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; }; - - # Try 
cloning REF as a tag prefixed with "v", otherwise fall back - # to git checkout for commit hashes - CLONE --recurse-submodules --shallow-submodules --also-filter-submodules --no-tags \ - "$REPO" -b "v$REF" "$DEST" || { \ - CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \ - git -C "$DEST" checkout "$REF" && \ - git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \ - }; + CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; }; + + # Try cloning REF as a tag prefixed with "v", otherwise fall back + # to git checkout for commit hashes + CLONE --recurse-submodules --shallow-submodules --also-filter-submodules --no-tags \ + "$REPO" -b "v$REF" "$DEST" || { \ + CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \ + git -C "$DEST" checkout "$REF" && \ + git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \ + }; EOT RUN chmod 755 /git/clone.sh @@ -492,11 +492,11 @@ ENV FLASH_ATTENTION_FORCE_BUILD=TRUE ARG BUILD_FLASH_ATTN_MAX_JOBS="" COPY <<-"EOT" /build/fa-build.sh - #!/bin/bash - set -eo pipefail; - if [ -n "$1" ]; then cd "$1"; fi; - python3 setup.py bdist_wheel --dist-dir /build/dist \ - | grep -Ev --line-buffered '^ptxas /tmp/|ptxas info\s*:|bytes spill stores' + #!/bin/bash + set -eo pipefail; + if [ -n "$1" ]; then cd "$1"; fi; + python3 setup.py bdist_wheel --dist-dir /build/dist \ + | grep -Ev --line-buffered '^ptxas /tmp/|ptxas info\s*:|bytes spill stores' EOT RUN chmod 755 /build/fa-build.sh From ecfac12e014e8ca4321117d8ec458e969b88d0ae Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 19 Nov 2024 15:39:27 -0600 Subject: [PATCH 42/94] build(torch): Invoke `fa-build.sh` correctly --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 352a0ae3..6351688c 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -517,7 +517,7 @@ RUN 
--mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar csrc/layer_norm \ csrc/rotary \ csrc/xentropy); \ - do /build/fa-build.sh; done + do /build/fa-build.sh "$EXT_DIR" || exit -1; done FROM flash-attn-builder-base as flash-attn-3-builder From a7d7e04066416e279869f68fc2c9ffa0d106a5cb Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 19 Nov 2024 15:50:22 -0600 Subject: [PATCH 43/94] build(torch): Use tabs for other heredocs --- torch/Dockerfile | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 6351688c..33e02d1f 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -26,10 +26,10 @@ WORKDIR /git RUN git config --global advice.detachedHead false COPY <<-"EOT" /git/clone.sh - #!/bin/sh - REPO="https://github.com/$1"; - DEST="$2"; - REF="$3"; + #!/bin/sh + REPO="https://github.com/$1"; + DEST="$2"; + REF="$3"; CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; }; @@ -262,31 +262,31 @@ RUN if [ "$(uname -m)" = "aarch64" ]; then \ gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 COPY <<-"EOT" /build/version-string.sh - #!/bin/sh - set -e; - VERSION="$1"; - - IS_HASH() { - echo "$1" | grep -qxiEe '[0-9a-f]{40}'; - }; - - if IS_HASH "$VERSION"; then - REAL_VERSION="$(cat ./version.txt)"; - SHORT_HASH="$(echo "$VERSION" | cut -c1-7)"; - echo "$REAL_VERSION+$SHORT_HASH"; - else - echo "$VERSION"; - fi; + #!/bin/sh + set -e; + VERSION="$1"; + + IS_HASH() { + echo "$1" | grep -qxiEe '[0-9a-f]{40}'; + }; + + if IS_HASH "$VERSION"; then + REAL_VERSION="$(cat ./version.txt)"; + SHORT_HASH="$(echo "$VERSION" | cut -c1-7)"; + echo "$REAL_VERSION+$SHORT_HASH"; + else + echo "$VERSION"; + fi; EOT RUN chmod 755 /build/version-string.sh COPY <<-"EOT" /build/storage-info.sh - #!/bin/sh - set -e; - TARGET="$(realpath "$1")"; + #!/bin/sh + set -e; + TARGET="$(realpath "$1")"; - 
STORAGE_INFO="$(df -h '--output=fstype,used,avail,pcent,target' "$TARGET")" || exit 0; - printf 'Storage info for %s:\n%s\n' "$TARGET" "$STORAGE_INFO"; + STORAGE_INFO="$(df -h '--output=fstype,used,avail,pcent,target' "$TARGET")" || exit 0; + printf 'Storage info for %s:\n%s\n' "$TARGET" "$STORAGE_INFO"; EOT RUN chmod 755 /build/storage-info.sh From 4e48edbcbc44f201596adbde29df3ee0d7320031 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 19 Nov 2024 15:50:29 -0600 Subject: [PATCH 44/94] build(torch): Use the resulting artifact from `flash-attn-3-builder` [skip ci] --- torch/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/Dockerfile b/torch/Dockerfile index 33e02d1f..3b8e36e7 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -537,6 +537,7 @@ COPY --link --from=torchaudio-builder /build/dist/ /build/dist/ COPY --link --from=torchvision-builder /build/dist/ /build/dist/ COPY --link --from=transformerengine-builder /build/dist/ /build/dist/ COPY --link --from=flash-attn-builder /build/dist/ /build/dist/ +COPY --link --from=flash-attn-3-builder /build/dist/ /build/dist/ ## Build the final torch image. 
FROM ${FINAL_BASE_IMAGE} From ac1610abdfafe5e18bb342d94be417e7b3393555 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 19 Nov 2024 16:54:01 -0600 Subject: [PATCH 45/94] build(torch): Filter more lines while building `flash-attn` --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 3b8e36e7..3170cf4b 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -496,7 +496,7 @@ COPY <<-"EOT" /build/fa-build.sh set -eo pipefail; if [ -n "$1" ]; then cd "$1"; fi; python3 setup.py bdist_wheel --dist-dir /build/dist \ - | grep -Ev --line-buffered '^ptxas /tmp/|ptxas info\s*:|bytes spill stores' + | grep -Ev --line-buffered '^ptxas (/tmp/|(info|warning)\s*:)|bytes spill stores' EOT RUN chmod 755 /build/fa-build.sh From 27f196498f8df4989a91382f9f7c3c23da608423 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 19 Nov 2024 16:57:29 -0600 Subject: [PATCH 46/94] build(torch): Build `flash-attn` and `flash-attn` 3 in sequence [skip ci] --- torch/Dockerfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torch/Dockerfile b/torch/Dockerfile index 3170cf4b..91f75cc1 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -521,6 +521,10 @@ RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar FROM flash-attn-builder-base as flash-attn-3-builder +# Artifically sequence this build stage after the previous one +# to prevent parallelism, because these are both very resource-intensive +RUN --mount=type=bind,from=flash-attn-builder,source-/build,target=/build : + # Build flash-attn v3 RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,target=flash-attention/,rw \ --mount=type=cache,target=/ccache \ From 83367bcaccd1780489698a957b6da68d7a8f23bc Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 19 Nov 2024 16:58:36 -0600 Subject: [PATCH 47/94] fix(torch): Fix typo in bind mount's `source=` parameter [skip ci] --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 91f75cc1..99983c7a 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -523,7 +523,7 @@ FROM flash-attn-builder-base as flash-attn-3-builder # Artifically sequence this build stage after the previous one # to prevent parallelism, because these are both very resource-intensive -RUN --mount=type=bind,from=flash-attn-builder,source-/build,target=/build : +RUN --mount=type=bind,from=flash-attn-builder,source=/build,target=/build : # Build flash-attn v3 RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,target=flash-attention/,rw \ From 915e47ecbca88b4255784ab9bb16c52de7d82e95 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 19 Nov 2024 17:13:17 -0600 Subject: [PATCH 48/94] fix(torch): Use `exit 1` instead of `exit -1` [skip ci] --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 99983c7a..d2604b38 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -517,7 +517,7 @@ RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar csrc/layer_norm \ csrc/rotary \ csrc/xentropy); \ - do /build/fa-build.sh "$EXT_DIR" || exit -1; done + do /build/fa-build.sh "$EXT_DIR" || exit 1; done FROM flash-attn-builder-base as flash-attn-3-builder From 7162b4463d65986c8c9374e4cc6181155cf854cf Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 25 Nov 2024 21:13:47 -0600 Subject: [PATCH 49/94] fix(torch): Broaden criteria to apply PyTorch patch --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index d2604b38..38f0f72a 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -50,7 +50,7 @@ FROM downloader-base as pytorch-downloader ARG BUILD_TORCH_VERSION # Includes a patch for a foreach bug in PyTorch v2.5.1 RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \ - if [ "${BUILD_TORCH_VERSION}" = '2.5.1' ]; then \ + if 
[ "${BUILD_TORCH_VERSION}" != '2.6.0' ]; then \ wget 'https://github.com/pytorch/pytorch/commit/1cdaf1d85f5e4b3f8952fd0737a1afeb16995d13.patch' -qO- \ | git -C pytorch apply; \ fi && \ From d6e73e7edf90150d2169d65d3a2841e05cfb1509 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 25 Nov 2024 21:16:22 -0600 Subject: [PATCH 50/94] feat(torch): Force compilation for compute capability 9.0a --- torch-extras/Dockerfile | 2 +- torch/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 46747c6a..5c8ace94 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -108,7 +108,7 @@ RUN if [ "$(uname -m)" = "aarch64" ]; then \ COPY --chmod=755 effective_cpu_count.sh . COPY --chmod=755 scale.sh . -ARG BUILD_NVCC_APPEND_FLAGS="" +ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a" ENV NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS:+$BUILD_NVCC_APPEND_FLAGS }-diag-suppress 186,177" ARG BUILD_MAX_JOBS diff --git a/torch/Dockerfile b/torch/Dockerfile index 38f0f72a..7ea2807d 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -318,7 +318,7 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST ENV TORCH_VERSION=$BUILD_TORCH_VERSION ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST -ARG BUILD_NVCC_APPEND_FLAGS="" +ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a" ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}" # If the directory /opt/nccl-tests exists, From b50c6f230db6053105506c6631c33efe178efb39 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 25 Nov 2024 21:21:59 -0600 Subject: [PATCH 51/94] fix(torch): Restore original criteria to apply PyTorch v2.5.1 patch --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 7ea2807d..a8281183 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -50,7 +50,7 @@ FROM downloader-base as 
pytorch-downloader ARG BUILD_TORCH_VERSION # Includes a patch for a foreach bug in PyTorch v2.5.1 RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \ - if [ "${BUILD_TORCH_VERSION}" != '2.6.0' ]; then \ + if [ "${BUILD_TORCH_VERSION}" = '2.5.1' ]; then \ wget 'https://github.com/pytorch/pytorch/commit/1cdaf1d85f5e4b3f8952fd0737a1afeb16995d13.patch' -qO- \ | git -C pytorch apply; \ fi && \ From 626b44dce0e6ab35a87e6f47a46ac8795af5ce38 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 25 Nov 2024 21:30:14 -0600 Subject: [PATCH 52/94] feat(torch): Specify string preprocessor definitions correctly --- torch-extras/Dockerfile | 6 +++--- torch/Dockerfile | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 5c8ace94..2af92114 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -97,11 +97,11 @@ COPY compiler_wrapper.f95 . ARG AMD64_NATIVE_ARCH="skylake" ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres" RUN if [ "$(uname -m)" = "aarch64" ]; then \ - NATIVE="${ARM64_NATIVE_ARCH}" && \ + NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \ AVX='WRAPPER_NO_AVX'; \ else \ - NATIVE="${AMD64_NATIVE_ARCH}" && \ - AVX='WRAPPER_AVX=AVX256'; \ + NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \ + AVX='WRAPPER_AVX="AVX256"'; \ fi && \ gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 diff --git a/torch/Dockerfile b/torch/Dockerfile index a8281183..29f78b28 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -253,11 +253,11 @@ COPY compiler_wrapper.f95 . 
ARG AMD64_NATIVE_ARCH="skylake" ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres" RUN if [ "$(uname -m)" = "aarch64" ]; then \ - NATIVE="${ARM64_NATIVE_ARCH}" && \ + NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \ AVX='WRAPPER_NO_AVX'; \ else \ - NATIVE="${AMD64_NATIVE_ARCH}" && \ - AVX='WRAPPER_AVX=AVX256'; \ + NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \ + AVX='WRAPPER_AVX="AVX256"'; \ fi && \ gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 From 762021f9164bfe2401999cedf2884eaf92827860 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 25 Nov 2024 21:40:49 -0600 Subject: [PATCH 53/94] fix(torch): Install `pybind11` before attempting to build Triton --- torch/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/Dockerfile b/torch/Dockerfile index 29f78b28..07024996 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -307,6 +307,7 @@ ARG BUILD_MAX_JOBS="" RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \ --mount=type=cache,target=/ccache \ if [ "$BUILD_TRITON" = '1' ]; then \ + pip3 install --no-cache-dir pybind11 && \ export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ cd triton/python && \ python3 -m pip wheel -w ../../dist/ --no-build-isolation --no-deps -vv . 
&& \ From 04cfc691be58adbb59f67f7928ceb8f3044e8174 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 25 Nov 2024 22:48:29 -0600 Subject: [PATCH 54/94] build(torch): Add missing `$` in `MAX_JOBS` default for TE --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 07024996..be5464e7 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -477,7 +477,7 @@ ENV NVTE_CUDA_ARCHS=$BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \ --mount=type=cache,target=/ccache \ export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \ - export MAX_JOBS="${BUILD_MAX_JOBS:-MAX_JOBS}" && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$MAX_JOBS}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ cd TransformerEngine && \ if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \ From e6dac89c4205cff3a2c57d4b2ed90d0a3946d5b7 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 26 Nov 2024 12:23:17 -0600 Subject: [PATCH 55/94] ci(torch): Drop CUDA 12.2.2 build --- .github/configurations/torch-base.yml | 2 +- .github/configurations/torch-nccl.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml index 4c94e304..cb727713 100644 --- a/.github/configurations/torch-base.yml +++ b/.github/configurations/torch-base.yml @@ -1,4 +1,4 @@ -cuda: [ 12.6.2, 12.4.1, 12.2.2 ] +cuda: [ 12.6.2, 12.4.1 ] os: [ ubuntu22.04, ubuntu20.04 ] include: - torch: 2.5.1 diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 74da04cc..f6dfd26f 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,4 +1,4 @@ -cuda: [ 12.6.2, 12.4.1, 12.2.2 ] +cuda: [ 12.6.2, 12.4.1 ] os: [ ubuntu22.04, ubuntu20.04 ] include: - torch: 2.5.1 From 56f06edd96b76ca2ab1bd0ac18dda3139793e418 Mon Sep 
17 00:00:00 2001 From: Eta Date: Tue, 26 Nov 2024 12:26:32 -0600 Subject: [PATCH 56/94] feat(torch-extras): Update Apex to `a1df804` --- torch-extras/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 2af92114..0b967a0b 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -2,7 +2,7 @@ ARG BASE_IMAGE ARG DEEPSPEED_VERSION="0.14.4" -ARG APEX_COMMIT="23c1f86520e22b505e8fdfcf6298273dff2d93d8" +ARG APEX_COMMIT="a1df80457ba67d60cbdb0d3ddfb08a2702c821a8" ARG DEEPSPEED_KERNELS_COMMIT="e77acc40b104696d4e73229b787d1ef29a9685b1" ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST="80;86;89;90" ARG XFORMERS_VERSION="0.0.28.post1" From bcd5fabf67f8d1c42071a0c20fec50c3436b366f Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 2 Dec 2024 19:46:59 -0600 Subject: [PATCH 57/94] feat(torch): Update `torch:nccl` base images for HPC-X v2.21 --- .github/configurations/torch-nccl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index f6dfd26f..94d85d47 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -5,4 +5,4 @@ include: vision: 0.20.0 audio: 2.5.0 nccl: 2.23.4-1 - nccl-tests-hash: c58f522 + nccl-tests-hash: 3ef8839 From c14235b0e33370ac22f0e4afc3882217b765a273 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 3 Dec 2024 17:09:53 -0600 Subject: [PATCH 58/94] ci: Update to newer self-hosted runners [skip ci] --- .github/workflows/build.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 79191ce7..fdb4cc41 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -34,7 +34,7 @@ jobs: build: name: Build Images runs-on: [ cw ] - container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0' + container: 
'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.6.0' defaults: run: shell: bash @@ -57,10 +57,21 @@ jobs: with: driver: remote endpoint: ${{ secrets.BUILDKIT_CONSUMER_ENDPOINT }} + append: | + - endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }} + platforms: linux/amd64 + - endpoint: ${{ secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }} + platforms: linux/arm64 env: BUILDER_NODE_0_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }} BUILDER_NODE_0_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }} BUILDER_NODE_0_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }} + BUILDER_NODE_1_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }} + BUILDER_NODE_1_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }} + BUILDER_NODE_1_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }} + BUILDER_NODE_2_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }} + BUILDER_NODE_2_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }} + BUILDER_NODE_2_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }} - name: Get base registry run: | echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV From 4f64cf128f208e660d0193b12e2d40142147090e Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 3 Dec 2024 17:10:54 -0600 Subject: [PATCH 59/94] ci(torch): Update CUDA 12.6 builds to 12.6.3; update `torch:nccl` bases --- .github/configurations/torch-base.yml | 2 +- .github/configurations/torch-nccl.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml index cb727713..0ff20e33 100644 --- a/.github/configurations/torch-base.yml +++ b/.github/configurations/torch-base.yml @@ -1,4 +1,4 @@ -cuda: [ 12.6.2, 12.4.1 ] +cuda: [ 12.6.3, 12.4.1 ] os: [ ubuntu22.04, ubuntu20.04 ] include: - torch: 2.5.1 diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 94d85d47..bfa98fe2 100644 --- 
a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,8 +1,8 @@ -cuda: [ 12.6.2, 12.4.1 ] +cuda: [ 12.6.3, 12.4.1 ] os: [ ubuntu22.04, ubuntu20.04 ] include: - torch: 2.5.1 vision: 0.20.0 audio: 2.5.0 nccl: 2.23.4-1 - nccl-tests-hash: 3ef8839 + nccl-tests-hash: 007a325 From 0dcf27deab369d1778b24399e6f792d237594b2a Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 3 Dec 2024 17:15:49 -0600 Subject: [PATCH 60/94] fix(torch): Edit `flash-attn` 3 installation for compatibility with TE TransformerEngine expects flash-attn 3 to be importable via: import flashattn_hopper.flash_attn_interface Which requires special shenanigans to support properly. This change adds that path as a symlink to the original flash_attn_interface file, and registers it as belonging to the flashattn-hopper distribution so that it can be uninstalled correctly if needed. --- torch/Dockerfile | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/torch/Dockerfile b/torch/Dockerfile index be5464e7..cb0f20aa 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -627,3 +627,51 @@ WORKDIR /usr/src/app RUN --mount=type=bind,from=builder,source=/build/dist,target=. \ pip3 install --no-cache-dir -U numpy packaging && \ pip3 install --no-cache-dir -U ./*.whl + +# Make a symlink to flash-attn v3 where TransformerEngine expects it, +# and modify the installation record so that pip uninstall knows how to +# fully remove it. 
+RUN <<-"EOT" + #!/bin/env python3 + from base64 import urlsafe_b64encode as b64 + from hashlib import sha256 + from importlib import metadata + from pathlib import Path + from py_compile import compile + + dist = metadata.distribution("flashattn-hopper") + p = dist.locate_file("flash_attn_interface.py") + print("flash_attn_interface:", p) + root = p.parent + + if not p.exists(): + raise SystemExit("flash_attn_interface not found") + if not p.is_file(): + raise SystemExit("flash_attn_interface path is not a file") + + d = root / "flashattn_hopper" + if d.exists(): + raise SystemExit(f'"{d}" already exists') + + d.mkdir(mode=0o755, parents=False, exist_ok=False) + new = d / p.name + new.symlink_to(p) + print(f"Created new symlink at {new}") + + compiled = Path(compile(new)) + + + def record_entry(path: Path) -> str: + content = path.read_bytes() + digest = b64(sha256(content).digest()).rstrip(b"=").decode() + package_path = path.relative_to(root).as_posix() + return f"{package_path},sha256={digest},{len(content):d}\r\n" + + + for f in dist.files: + if f.match("flashattn?hopper-*.dist-info/RECORD"): + with f.locate().open("a", encoding="utf-8", newline="") as record: + for added in (new, compiled): + record.write(record_entry(added)) + break +EOT From fd6df40bb43433905de96b47db1c119eef21240f Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 16 Dec 2024 14:10:57 -0600 Subject: [PATCH 61/94] fix(torch): Add redundant interpreter specification for compatibility --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index cb0f20aa..4ebf58b6 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -631,7 +631,7 @@ RUN --mount=type=bind,from=builder,source=/build/dist,target=. \ # Make a symlink to flash-attn v3 where TransformerEngine expects it, # and modify the installation record so that pip uninstall knows how to # fully remove it. 
-RUN <<-"EOT" +RUN <<-"EOT" python3 #!/bin/env python3 from base64 import urlsafe_b64encode as b64 from hashlib import sha256 From ebaf5aa31ae6d09d8cc28528e9178aa968080273 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 16 Dec 2024 15:11:14 -0600 Subject: [PATCH 62/94] feat(torch): Update LLVM components, including `libomp` runtime library --- torch-extras/Dockerfile | 9 +++------ torch/Dockerfile | 20 +++++++++++++++----- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 0b967a0b..30bdc784 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -72,18 +72,15 @@ RUN apt-get -qq update && apt-get -qq install -y \ # Update compiler (GCC) and linker (LLD) versions # gfortran-11 is just for compiler_wrapper.f95 -RUN CODENAME="$(lsb_release -cs)" && \ - wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ - apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \ - apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ +RUN LLVM_VERSION='18' && \ apt-get -qq update && apt-get -qq install --no-install-recommends -y \ - gcc-11 g++-11 gfortran-11 lld-17 && \ + gcc-11 g++-11 gfortran-11 "lld-$LLVM_VERSION" && \ apt-get clean && \ update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \ update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \ update-alternatives --install \ /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \ - update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1 + update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1 RUN mkdir /wheels /build WORKDIR /build diff --git a/torch/Dockerfile b/torch/Dockerfile index 4ebf58b6..7a9e6e50 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -197,13 +197,16 @@ ENV CCACHE_DIR=/ccache \ CMAKE_CUDA_COMPILER_LAUNCHER=ccache # Update compiler (GCC) and linker (LLD) versions -RUN 
CODENAME="$(lsb_release -cs)" && \ +RUN LLVM_VERSION='18' && \ + CODENAME="$(lsb_release -cs)" && \ wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ - apt-add-repository -n "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \ + apt-add-repository -n "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \ SETUP_TOOLCHAIN() { \ apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \ | sed -e '/connection timed out/{p; Q1}' && \ - apt-get -qq install --no-install-recommends -y gcc-11 g++-11 gfortran-11 lld-17 && \ + apt-get -qq install --no-install-recommends -y \ + gcc-11 g++-11 gfortran-11 \ + "lld-$LLVM_VERSION" "libomp-$LLVM_VERSION-dev" && \ apt-get clean; \ } && \ { SETUP_TOOLCHAIN || { sleep "$(shuf -i10-20 -n1)" && SETUP_TOOLCHAIN; }; } && \ @@ -211,7 +214,7 @@ RUN CODENAME="$(lsb_release -cs)" && \ update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \ update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \ if [ "$(uname -m)" != 'aarch64' ]; then \ - update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1; \ + update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1; \ fi # Install AOCL-BLAS and AOCL-LAPACK @@ -563,7 +566,7 @@ RUN apt-get -qq update && apt-get -qq install -y \ ldconfig RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \ - software-properties-common && \ + software-properties-common lsb-release && \ SETUP_LIBSTDCXX() { \ apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \ | sed -e '/connection timed out/{p; Q1}' && \ @@ -572,6 +575,13 @@ RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \ } && \ { SETUP_LIBSTDCXX || { sleep "$(shuf -i10-20 -n1)" && SETUP_LIBSTDCXX; }; } +RUN LLVM_VERSION='18' && \ + CODENAME="$(lsb_release -cs)" && \ + wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > 
/etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ + apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \ + apt-get -qq install -y --no-install-recommends "libomp5-$LLVM_VERSION" && \ + apt-get clean + # Install AOCL-BLAS and AOCL-LAPACK # See: https://www.amd.com/en/developer/aocl/dense.html ARG AOCL_BASE From ed93c44190e043c8ee9700b97bf0ebdef3d0d207 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 16 Dec 2024 15:32:10 -0600 Subject: [PATCH 63/94] ci: Remove deprecated BuildKit runner endpoint [skip ci] --- .github/workflows/build.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fdb4cc41..2c825b36 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -56,10 +56,9 @@ jobs: uses: docker/setup-buildx-action@v3.7.1 with: driver: remote - endpoint: ${{ secrets.BUILDKIT_CONSUMER_ENDPOINT }} + endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }} + platforms: linux/amd64 append: | - - endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }} - platforms: linux/amd64 - endpoint: ${{ secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }} platforms: linux/arm64 env: @@ -69,9 +68,6 @@ jobs: BUILDER_NODE_1_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }} BUILDER_NODE_1_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }} BUILDER_NODE_1_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }} - BUILDER_NODE_2_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }} - BUILDER_NODE_2_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }} - BUILDER_NODE_2_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }} - name: Get base registry run: | echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV From 2da2fb144e5aad8b613f1d62b181963ef591a8e3 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 24 Dec 2024 12:30:22 -0600 Subject: [PATCH 64/94] feat(torch): Upgrade TransformerEngine to v1.13 --- 
torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 7a9e6e50..4ee75b02 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -5,7 +5,7 @@ ARG FINAL_BASE_IMAGE="nvidia/cuda:12.4.1-base-ubuntu22.04" ARG BUILD_TORCH_VERSION="2.5.1" ARG BUILD_TORCH_VISION_VERSION="0.20.0" ARG BUILD_TORCH_AUDIO_VERSION="2.5.0" -ARG BUILD_TRANSFORMERENGINE_VERSION="1.12" +ARG BUILD_TRANSFORMERENGINE_VERSION="1.13" ARG BUILD_FLASH_ATTN_VERSION="2.6.3" ARG BUILD_FLASH_ATTN_3_VERSION="2.7.0.post2" ARG BUILD_TRITON_VERSION="" From 09ed20035d253a1f79cd97320d570f38d26a6d73 Mon Sep 17 00:00:00 2001 From: Eta Date: Tue, 24 Dec 2024 12:31:02 -0600 Subject: [PATCH 65/94] fix(torch): Add `-ffree-line-length-512` to `gfortran` invocations [skip ci] --- torch-extras/Dockerfile | 2 +- torch/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 30bdc784..5a6c66f4 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -100,7 +100,7 @@ RUN if [ "$(uname -m)" = "aarch64" ]; then \ NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \ AVX='WRAPPER_AVX="AVX256"'; \ fi && \ - gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 + gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 COPY --chmod=755 effective_cpu_count.sh . COPY --chmod=755 scale.sh . 
diff --git a/torch/Dockerfile b/torch/Dockerfile index 4ee75b02..a9360dd0 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -262,7 +262,7 @@ RUN if [ "$(uname -m)" = "aarch64" ]; then \ NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \ AVX='WRAPPER_AVX="AVX256"'; \ fi && \ - gfortran -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 + gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 COPY <<-"EOT" /build/version-string.sh #!/bin/sh From 8f95f595b3c9e75b279811042fb12a5b338bb388 Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 9 Jan 2025 17:59:18 -0600 Subject: [PATCH 66/94] feat(torch): Update `flash-attention` 2 & 3 to v2.7.2 --- torch/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index a9360dd0..1aacffbc 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -6,8 +6,8 @@ ARG BUILD_TORCH_VERSION="2.5.1" ARG BUILD_TORCH_VISION_VERSION="0.20.0" ARG BUILD_TORCH_AUDIO_VERSION="2.5.0" ARG BUILD_TRANSFORMERENGINE_VERSION="1.13" -ARG BUILD_FLASH_ATTN_VERSION="2.6.3" -ARG BUILD_FLASH_ATTN_3_VERSION="2.7.0.post2" +ARG BUILD_FLASH_ATTN_VERSION="2.7.2.post1" +ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1" ARG BUILD_TRITON_VERSION="" ARG BUILD_TRITON="1" ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX" From 8e290756f90c6ca5ac5c8e83f02b1c5ed63a45a3 Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 9 Jan 2025 21:58:38 -0600 Subject: [PATCH 67/94] ci(torch): Drop Ubuntu 20.04 CI builds --- .github/configurations/torch-base.yml | 2 +- .github/configurations/torch-nccl.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml index 0ff20e33..d0dcc24f 100644 --- a/.github/configurations/torch-base.yml +++ b/.github/configurations/torch-base.yml @@ -1,5 +1,5 @@ 
cuda: [ 12.6.3, 12.4.1 ] -os: [ ubuntu22.04, ubuntu20.04 ] +os: [ ubuntu22.04 ] include: - torch: 2.5.1 vision: 0.20.0 diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index bfa98fe2..21c3ed57 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,5 +1,5 @@ cuda: [ 12.6.3, 12.4.1 ] -os: [ ubuntu22.04, ubuntu20.04 ] +os: [ ubuntu22.04 ] include: - torch: 2.5.1 vision: 0.20.0 From 459aa230741312515b82f7e36b89a5ec6db424d6 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 29 Jan 2025 01:27:56 -0600 Subject: [PATCH 68/94] ci: Re-enable multi-arch builds [skip ci] --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2c825b36..6eb3b114 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -109,7 +109,7 @@ jobs: labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }} cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max - platforms: linux/amd64 + platforms: linux/amd64,linux/arm64 - name: Clear registry credentials if: always() run: | From cd1019bd7e3d131755e3c42904ebebb5d3e7d2e6 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 29 Jan 2025 01:28:38 -0600 Subject: [PATCH 69/94] build(torch): Add new build targets with CUDA 12.8.0 --- .github/configurations/torch-base.yml | 2 +- .github/configurations/torch-nccl.yml | 6 +-- torch/Dockerfile | 56 ++++++++++++++++----------- 3 files changed, 37 insertions(+), 27 deletions(-) diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml index d0dcc24f..18d5caa7 100644 --- a/.github/configurations/torch-base.yml +++ b/.github/configurations/torch-base.yml @@ -1,4 +1,4 @@ -cuda: [ 12.6.3, 12.4.1 ] +cuda: [ 12.8.0, 12.6.3, 12.4.1 ] os: [ ubuntu22.04 ] 
include: - torch: 2.5.1 diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 21c3ed57..0c84b40f 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,8 +1,8 @@ -cuda: [ 12.6.3, 12.4.1 ] +cuda: [ 12.8.0, 12.6.3, 12.4.1 ] os: [ ubuntu22.04 ] include: - torch: 2.5.1 vision: 0.20.0 audio: 2.5.0 - nccl: 2.23.4-1 - nccl-tests-hash: 007a325 + nccl: 2.25.1-1 + nccl-tests-hash: 4e02d6a diff --git a/torch/Dockerfile b/torch/Dockerfile index 1aacffbc..1994ed7a 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1.4 -ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.4.1-devel-ubuntu22.04" -ARG FINAL_BASE_IMAGE="nvidia/cuda:12.4.1-base-ubuntu22.04" +ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.8.0-devel-ubuntu22.04" +ARG FINAL_BASE_IMAGE="nvidia/cuda:12.8.0-base-ubuntu22.04" ARG BUILD_TORCH_VERSION="2.5.1" ARG BUILD_TORCH_VISION_VERSION="0.20.0" @@ -21,7 +21,7 @@ ARG AOCL_URL="https://download.amd.com/developer/eula/aocl/aocl-4-2/aocl-linux-a # Clone PyTorch repositories independently from all other build steps # for cache-friendliness and parallelization -FROM alpine/git:2.40.1 as downloader-base +FROM alpine/git:2.40.1 AS downloader-base WORKDIR /git RUN git config --global advice.detachedHead false @@ -46,7 +46,7 @@ EOT RUN chmod 755 /git/clone.sh -FROM downloader-base as pytorch-downloader +FROM downloader-base AS pytorch-downloader ARG BUILD_TORCH_VERSION # Includes a patch for a foreach bug in PyTorch v2.5.1 RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \ @@ -56,12 +56,12 @@ RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \ fi && \ rm -rf pytorch/.git -FROM downloader-base as torchvision-downloader +FROM downloader-base AS torchvision-downloader ARG BUILD_TORCH_VISION_VERSION RUN ./clone.sh pytorch/vision vision "${BUILD_TORCH_VISION_VERSION}" && \ rm -rf vision/.git -FROM downloader-base as torchaudio-downloader 
+FROM downloader-base AS torchaudio-downloader ARG BUILD_TORCH_AUDIO_VERSION RUN ./clone.sh pytorch/audio audio "${BUILD_TORCH_AUDIO_VERSION}" # The torchaudio build requires that this directory remain a full git repository, @@ -77,15 +77,15 @@ RUN if grep -qF '#include ' \ fi && \ rm /git/patch -FROM downloader-base as transformerengine-downloader +FROM downloader-base AS transformerengine-downloader ARG BUILD_TRANSFORMERENGINE_VERSION RUN ./clone.sh NVIDIA/TransformerEngine TransformerEngine "${BUILD_TRANSFORMERENGINE_VERSION}" -FROM downloader-base as flash-attn-downloader +FROM downloader-base AS flash-attn-downloader ARG BUILD_FLASH_ATTN_VERSION RUN ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_VERSION}" -FROM downloader-base as flash-attn-3-downloader +FROM downloader-base AS flash-attn-3-downloader ARG BUILD_FLASH_ATTN_3_VERSION RUN if [ -n "$BUILD_FLASH_ATTN_3_VERSION" ]; then \ ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_3_VERSION}"; \ @@ -93,7 +93,7 @@ RUN if [ -n "$BUILD_FLASH_ATTN_3_VERSION" ]; then \ mkdir flash-attention; \ fi -FROM downloader-base as triton-version +FROM downloader-base AS triton-version ENV TRITON_COMMIT_FILE='.ci/docker/ci_commit_pins/triton.txt' COPY --link --from=pytorch-downloader "/git/pytorch/${TRITON_COMMIT_FILE}" /git/version.txt ARG BUILD_TRITON_VERSION @@ -101,7 +101,7 @@ RUN if [ -n "${BUILD_TRITON_VERSION}" ]; then \ echo "${BUILD_TRITON_VERSION}" > /git/version.txt; \ fi -FROM downloader-base as triton-downloader +FROM downloader-base AS triton-downloader COPY --link --from=triton-version /git/version.txt /git/version.txt ARG BUILD_TRITON RUN if [ "${BUILD_TRITON}" = '1' ]; then \ @@ -110,7 +110,7 @@ RUN if [ "${BUILD_TRITON}" = '1' ]; then \ mkdir triton; \ fi -FROM alpine/curl:8.7.1 as aocl-downloader +FROM alpine/curl:8.7.1 AS aocl-downloader WORKDIR /tmp/install RUN apk add --no-cache bash @@ -136,7 +136,7 @@ RUN curl -sSfo- "${AOCL_URL}" | tar xzf - 
--strip-components 1 && \ ## Build PyTorch on a builder image. -FROM ${BUILDER_BASE_IMAGE} as builder-base +FROM ${BUILDER_BASE_IMAGE} AS builder-base-shared ENV DEBIAN_FRONTEND=noninteractive ARG BUILD_CCACHE_SIZE="1Gi" @@ -215,8 +215,16 @@ RUN LLVM_VERSION='18' && \ update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \ if [ "$(uname -m)" != 'aarch64' ]; then \ update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1; \ - fi + fi && \ + ldconfig + +FROM builder-base-shared AS builder-base-arm64 +# There is currently no CPU BLAS used for ARM builds, +# so this stage is just an alias + + +FROM builder-base-shared AS builder-base-amd64 # Install AOCL-BLAS and AOCL-LAPACK # See: https://www.amd.com/en/developer/aocl/dense.html ARG AOCL_BASE @@ -248,6 +256,8 @@ ENV C_INCLUDE_PATH="${AOCL_BASE}/include${C_INCLUDE_PATH:+:$C_INCLUDE_PATH}" \ LD_LIBRARY_PATH="${AOCL_BASE}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \ LIBRARY_PATH="${AOCL_BASE}/lib${LIBRARY_PATH:+:$LIBRARY_PATH}" + +FROM builder-base-${TARGETARCH} AS builder-base RUN mkdir /build /build/dist WORKDIR /build COPY --chmod=755 effective_cpu_count.sh . 
@@ -322,8 +332,8 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST ENV TORCH_VERSION=$BUILD_TORCH_VERSION ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST -ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a" -ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}" +ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a -gencode=arch=compute_100,code=[sm_100,compute_100]" +ENV NVCC_APPEND_FLAGS="-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}" # If the directory /opt/nccl-tests exists, # the base image is assumed to be nccl-tests, @@ -394,7 +404,7 @@ RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl RUN python3 -m pip install -U --no-cache-dir \ packaging setuptools wheel pip -FROM builder-base as torchvision-builder +FROM builder-base AS torchvision-builder RUN rm ./dist/* ## Build torchvision @@ -432,7 +442,7 @@ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=visi TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist -FROM builder-base as torchaudio-builder +FROM builder-base AS torchaudio-builder RUN rm ./dist/* ## Build torchaudio @@ -470,7 +480,7 @@ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/ TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist -FROM builder-base as transformerengine-builder +FROM builder-base AS transformerengine-builder RUN rm ./dist/* # Build TransformerEngine @@ -489,7 +499,7 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE fi && \ python3 setup.py bdist_wheel --dist-dir /build/dist -FROM builder-base as flash-attn-builder-base +FROM builder-base AS flash-attn-builder-base RUN rm ./dist/* ENV PYTHONUNBUFFERED=1 ENV FLASH_ATTENTION_FORCE_BUILD=TRUE @@ -504,7 +514,7 @@ COPY <<-"EOT" /build/fa-build.sh EOT RUN chmod 755 
/build/fa-build.sh -FROM flash-attn-builder-base as flash-attn-builder +FROM flash-attn-builder-base AS flash-attn-builder # Build flash-attn RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \ @@ -523,7 +533,7 @@ RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar csrc/xentropy); \ do /build/fa-build.sh "$EXT_DIR" || exit 1; done -FROM flash-attn-builder-base as flash-attn-3-builder +FROM flash-attn-builder-base AS flash-attn-3-builder # Artifically sequence this build stage after the previous one # to prevent parallelism, because these are both very resource-intensive @@ -540,7 +550,7 @@ RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,t echo "MAX_JOBS: ${MAX_JOBS}" && \ /build/fa-build.sh flash-attention/hopper -FROM builder-base as builder +FROM builder-base AS builder COPY --link --from=torchaudio-builder /build/dist/ /build/dist/ COPY --link --from=torchvision-builder /build/dist/ /build/dist/ COPY --link --from=transformerengine-builder /build/dist/ /build/dist/ From 643b362d7b62b1d565494004585ea6d17b3904c8 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 29 Jan 2025 01:36:41 -0600 Subject: [PATCH 70/94] ci(torch): Update `nccl-tests` commit hash --- .github/configurations/torch-nccl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 0c84b40f..71c96ce3 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -5,4 +5,4 @@ include: vision: 0.20.0 audio: 2.5.0 nccl: 2.25.1-1 - nccl-tests-hash: 4e02d6a + nccl-tests-hash: 57fa979 From b58974c2a02fd66212566821bcd01a9197eee1d7 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 29 Jan 2025 02:06:18 -0600 Subject: [PATCH 71/94] build(torch): Filter `compute_100` build on older CUDA versions --- torch/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git 
a/torch/Dockerfile b/torch/Dockerfile index 1994ed7a..a26b655a 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -333,6 +333,10 @@ ENV TORCH_VERSION=$BUILD_TORCH_VERSION ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a -gencode=arch=compute_100,code=[sm_100,compute_100]" +# Remove compute_100 build if NV_CUDA_LIB_VERSION doesn't match 12.[89].* +ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS}:::${NV_CUDA_LIB_VERSION}" +ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS%:::12.[89].*}" +ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS% -gencode=arch=compute_100,code=\[sm_100,compute_100\]:::*}" ENV NVCC_APPEND_FLAGS="-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}" # If the directory /opt/nccl-tests exists, @@ -366,6 +370,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch --mount=type=cache,target=/ccache \ export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ ./storage-info.sh . && \ cd pytorch && \ ../storage-info.sh . && \ From 7095c593814106093906dc6a50e30a8a2a9a6988 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 29 Jan 2025 02:26:51 -0600 Subject: [PATCH 72/94] build(torch): Switch `NVCC_APPEND_FLAGS` to not be an `ENV` directive --- torch-extras/Dockerfile | 16 ++++++++++++---- torch/Dockerfile | 24 ++++++++++++++++++------ 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 5a6c66f4..415608b4 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -106,7 +106,11 @@ COPY --chmod=755 effective_cpu_count.sh . COPY --chmod=755 scale.sh . 
ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a" -ENV NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS:+$BUILD_NVCC_APPEND_FLAGS }-diag-suppress 186,177" +RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \ + case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \ + FLAGS="${FLAGS} -gencode=arch=compute_100,code=[sm_100,compute_100]" ;; \ + esac && \ + echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf ARG BUILD_MAX_JOBS @@ -114,6 +118,7 @@ FROM builder-base as deepspeed-builder ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=ds-kernels/,rw \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ cd ds-kernels && \ export CUDA_ARCH_LIST="${DEEPSPEED_KERNELS_CUDA_ARCH_LIST}" && \ echo "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" && \ @@ -146,8 +151,9 @@ ARG DS_BUILD_AIO="" ARG DEEPSPEED_VERSION SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN python3 -m pip install -U --no-cache-dir \ - setuptools wheel pip deepspeed-kernels py-cpuinfo && \ +RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + python3 -m pip install -U --no-cache-dir \ + setuptools wheel pip py-cpuinfo && \ if python3 -m pip show torch | grep 'Version: 2\.[1-9]' > /dev/null; then \ # DeepSpeed's AIO extension is incompatible with PyTorch 2.1.x's # requirement for C++17 (as of DeepSpeed 0.10.1). @@ -198,6 +204,7 @@ RUN LIBNCCL2_VERSION=$(dpkg-query --showformat='${Version}' --show libnccl2) && # --distributed_adam, --distributed_lamb, and --group_norm aren't documented # in the Apex README, but are defined in its setup.py config. RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ python3 -m pip install -U --no-cache-dir \ packaging setuptools wheel pip && \ CUDA_MAJOR_VERSION=$(echo "${CUDA_VERSION}" | cut -d. 
-f1) && \ @@ -251,7 +258,8 @@ FROM builder-base as xformers-builder ARG XFORMERS_VERSION SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN python3 -m pip install -U --no-cache-dir \ +RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + python3 -m pip install -U --no-cache-dir \ setuptools wheel pip && \ CC=$(realpath -e ./compiler) \ MAX_JOBS=1 \ diff --git a/torch/Dockerfile b/torch/Dockerfile index a26b655a..795e1172 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -332,12 +332,13 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST ENV TORCH_VERSION=$BUILD_TORCH_VERSION ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST -ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a -gencode=arch=compute_100,code=[sm_100,compute_100]" -# Remove compute_100 build if NV_CUDA_LIB_VERSION doesn't match 12.[89].* -ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS}:::${NV_CUDA_LIB_VERSION}" -ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS%:::12.[89].*}" -ARG BUILD_NVCC_APPEND_FLAGS="${BUILD_NVCC_APPEND_FLAGS% -gencode=arch=compute_100,code=\[sm_100,compute_100\]:::*}" -ENV NVCC_APPEND_FLAGS="-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${BUILD_NVCC_APPEND_FLAGS:+ $BUILD_NVCC_APPEND_FLAGS}" +ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a" +# Add compute_100 build if NV_CUDA_LIB_VERSION matches 12.[89].* +RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \ + case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \ + FLAGS="${FLAGS} -gencode=arch=compute_100,code=[sm_100,compute_100]" ;; \ + esac && \ + echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf # If the directory /opt/nccl-tests exists, # the base image is assumed to be nccl-tests, @@ -370,6 +371,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch --mount=type=cache,target=/ccache \ export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ + 
export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ ./storage-info.sh . && \ cd pytorch && \ @@ -422,6 +424,8 @@ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=visi --mount=type=cache,target=/ccache \ export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ cd vision && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ @@ -460,6 +464,8 @@ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/ --mount=type=cache,target=/ccache \ export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ cd audio && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ @@ -497,6 +503,8 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \ export MAX_JOBS="${BUILD_MAX_JOBS:-$MAX_JOBS}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ cd TransformerEngine && \ if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \ sed -i "s/from functools import cache/from functools import lru_cache as cache/g" \ @@ -527,6 +535,8 @@ RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,tar export CC=$(realpath -e ./compiler) \ MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ cd flash-attention && \ for EXT_DIR in $(realpath -s -e 
\ . \ @@ -553,6 +563,8 @@ RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,t export CC=$(realpath -e ./compiler) \ MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 10 6)}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ /build/fa-build.sh flash-attention/hopper FROM builder-base AS builder From 440d8441146fdfb22a9ea3237ba684a978b70573 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 29 Jan 2025 19:10:55 -0600 Subject: [PATCH 73/94] ci: Build only for `linux/amd64` again [skip ci] --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6eb3b114..2c825b36 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -109,7 +109,7 @@ jobs: labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }} cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max - platforms: linux/amd64,linux/arm64 + platforms: linux/amd64 - name: Clear registry credentials if: always() run: | From 6bc6fb62a5dce9a7825131e4fd7f22deaab42604 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 29 Jan 2025 19:13:30 -0600 Subject: [PATCH 74/94] feat(torch): Build with PyTorch v2.6.0 --- .github/configurations/torch-base.yml | 6 +++--- .github/configurations/torch-nccl.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml index 18d5caa7..1d7cebe2 100644 --- a/.github/configurations/torch-base.yml +++ b/.github/configurations/torch-base.yml @@ -1,6 +1,6 @@ cuda: [ 12.8.0, 12.6.3, 12.4.1 ] os: [ ubuntu22.04 ] include: - - torch: 2.5.1 - vision: 0.20.0 - audio: 2.5.0 + - torch: 2.6.0 + vision: 
0.21.0 + audio: 2.6.0 diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 71c96ce3..826ddf93 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,8 +1,8 @@ cuda: [ 12.8.0, 12.6.3, 12.4.1 ] os: [ ubuntu22.04 ] include: - - torch: 2.5.1 - vision: 0.20.0 - audio: 2.5.0 + - torch: 2.6.0 + vision: 0.21.0 + audio: 2.6.0 nccl: 2.25.1-1 nccl-tests-hash: 57fa979 From ba41ff96cc1648369e0a34a2bd6795749ccb417c Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 29 Jan 2025 19:41:33 -0600 Subject: [PATCH 75/94] feat(torch): Build with `flash-attn` v2.7.4.post1 --- torch/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 795e1172..441d9129 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -6,8 +6,8 @@ ARG BUILD_TORCH_VERSION="2.5.1" ARG BUILD_TORCH_VISION_VERSION="0.20.0" ARG BUILD_TORCH_AUDIO_VERSION="2.5.0" ARG BUILD_TRANSFORMERENGINE_VERSION="1.13" -ARG BUILD_FLASH_ATTN_VERSION="2.7.2.post1" -ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1" +ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1" +ARG BUILD_FLASH_ATTN_3_VERSION="2.7.4.post1" ARG BUILD_TRITON_VERSION="" ARG BUILD_TRITON="1" ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX" From 25e8a9ed55da858e1b9526fd3c8034ad8665ea27 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 29 Jan 2025 19:45:08 -0600 Subject: [PATCH 76/94] build(torch): Build both CXX11 ABI variants --- .github/configurations/torch-base.yml | 1 + .github/configurations/torch-nccl.yml | 1 + .github/workflows/torch-base.yml | 3 ++- .github/workflows/torch-nccl.yml | 3 ++- .github/workflows/torch.yml | 8 ++++++++ torch/Dockerfile | 4 ++++ 6 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml index 1d7cebe2..f75b79a5 100644 --- a/.github/configurations/torch-base.yml +++ 
b/.github/configurations/torch-base.yml @@ -1,5 +1,6 @@ cuda: [ 12.8.0, 12.6.3, 12.4.1 ] os: [ ubuntu22.04 ] +abi: [ 1, 0 ] include: - torch: 2.6.0 vision: 0.21.0 diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 826ddf93..ec1f1f91 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,5 +1,6 @@ cuda: [ 12.8.0, 12.6.3, 12.4.1 ] os: [ ubuntu22.04 ] +abi: [ 1, 0 ] include: - torch: 2.6.0 vision: 0.21.0 diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml index 93148a65..a6fc2fb1 100644 --- a/.github/workflows/torch-base.yml +++ b/.github/workflows/torch-base.yml @@ -35,11 +35,12 @@ jobs: secrets: inherit with: image-name: ${{ inputs.image-name }} - tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }} + tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }} builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-${{ matrix.os }} base-image: nvidia/cuda:${{ matrix.cuda }}-base-${{ matrix.os }} torch-version: ${{ matrix.torch }} torchvision-version: ${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} + cxx11-abi: ${{ matrix.abi }} cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml index 9815639d..feae8f7b 100644 --- a/.github/workflows/torch-nccl.yml +++ b/.github/workflows/torch-nccl.yml @@ -43,11 +43,12 @@ jobs: secrets: inherit with: image-name: ${{ inputs.image-name }} - tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, 
matrix.vision, matrix.audio)) }} + tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }} builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} torch-version: ${{ matrix.torch }} torchvision-version: ${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} + cxx11-abi: ${{ matrix.abi }} cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml index 36bdcc6d..b0eb4634 100644 --- a/.github/workflows/torch.yml +++ b/.github/workflows/torch.yml @@ -22,6 +22,9 @@ on: triton-version: required: false type: string + cxx11-abi: + required: false + type: string cuda-arch-support: required: false type: string @@ -67,6 +70,10 @@ on: required: false description: "Tagged version number from openai/triton to build" type: string + cxx11-abi: + required: false + description: "Build with the CXX11 ABI (1 = enable, 0 = disable)" + type: string cuda-arch-support: required: false description: "Space-separated list of CUDA architectures to support" @@ -99,6 +106,7 @@ jobs: BUILD_TORCH_VERSION=${{ inputs.torch-version }} BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }} BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }} + ${{ inputs.cxx11-abi && format('BUILD_CXX11_ABI={0}', inputs.cxx11-abi) || '' }} ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }} ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }} build-extras: diff --git a/torch/Dockerfile b/torch/Dockerfile index 
441d9129..5ca9715d 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -366,6 +366,7 @@ RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \ # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271). # Without WITH_BLAS, it would detect the BLAS implementation as # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either. +ARG BUILD_CXX11_ABI="" SHELL ["/bin/bash", "-eo", "pipefail", "-c"] RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \ --mount=type=cache,target=/ccache \ @@ -373,6 +374,9 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch echo "MAX_JOBS: ${MAX_JOBS}" && \ export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ + if [ -n "${BUILD_CXX11_ABI}" ]; then \ + export _GLIBCXX_USE_CXX11_ABI="${BUILD_CXX11_ABI}"; \ + fi && \ ./storage-info.sh . && \ cd pytorch && \ ../storage-info.sh . && \ From 77574c2f694148d48ef49cca069d2835364f82ed Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 29 Jan 2025 19:49:34 -0600 Subject: [PATCH 77/94] ci(torch): Remove parameterization of `TORCH_CUDA_ARCH_LIST` This is to fit within the 10-parameter limit for reusable workflows. 
--- .github/workflows/torch.yml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml index b0eb4634..6538dff6 100644 --- a/.github/workflows/torch.yml +++ b/.github/workflows/torch.yml @@ -25,10 +25,6 @@ on: cxx11-abi: required: false type: string - cuda-arch-support: - required: false - type: string - default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX" image-name: required: false type: string @@ -74,11 +70,6 @@ on: required: false description: "Build with the CXX11 ABI (1 = enable, 0 = disable)" type: string - cuda-arch-support: - required: false - description: "Space-separated list of CUDA architectures to support" - type: string - default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX" image-name: required: false description: "Custom name under which to publish the resulting container" @@ -106,8 +97,8 @@ jobs: BUILD_TORCH_VERSION=${{ inputs.torch-version }} BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }} BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }} + BUILD_TORCH_CUDA_ARCH_LIST=7.0 7.5 8.0 8.6 8.9 9.0+PTX ${{ inputs.cxx11-abi && format('BUILD_CXX11_ABI={0}', inputs.cxx11-abi) || '' }} - ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }} ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }} build-extras: name: Build torch-extras From 486738eb35217f03e08faaa320dec92b8e98d379 Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 30 Jan 2025 11:19:26 -0600 Subject: [PATCH 78/94] build(torch): Downgrade `flash-attn` 3 to the 2.7.2.post1 tag 2.7.4.post1 has build conflicts flash-attn 2 and with compute_100 --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 5ca9715d..4f5862d0 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -7,7 +7,7 @@ ARG BUILD_TORCH_VISION_VERSION="0.20.0" ARG BUILD_TORCH_AUDIO_VERSION="2.5.0" ARG 
BUILD_TRANSFORMERENGINE_VERSION="1.13" ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1" -ARG BUILD_FLASH_ATTN_3_VERSION="2.7.4.post1" +ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1" ARG BUILD_TRITON_VERSION="" ARG BUILD_TRITON="1" ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX" From 2da31d97755e049ad4c4bee31a7ce30b1c0cd4dc Mon Sep 17 00:00:00 2001 From: Eta Date: Sat, 1 Feb 2025 12:18:22 -0600 Subject: [PATCH 79/94] ci: Re-enable ARM64 builds again --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2c825b36..603b0b97 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -34,7 +34,7 @@ jobs: build: name: Build Images runs-on: [ cw ] - container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.6.0' + container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0' defaults: run: shell: bash @@ -109,7 +109,7 @@ jobs: labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }} cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max - platforms: linux/amd64 + platforms: linux/amd64,linux/arm64 - name: Clear registry credentials if: always() run: | From ac7f89d73d623500396ab30458fa09c598d02f60 Mon Sep 17 00:00:00 2001 From: Eta Date: Sat, 1 Feb 2025 12:19:55 -0600 Subject: [PATCH 80/94] ci(torch): Increase `torch` image build job timeout --- .github/workflows/torch.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml index 6538dff6..4b27af68 100644 --- a/.github/workflows/torch.yml +++ b/.github/workflows/torch.yml @@ -82,6 +82,7 @@ on: jobs: build: + timeout-minutes: 960 name: Build torch uses: ./.github/workflows/build.yml secrets: inherit From f9ffd6f282ca266d2e26f984fa1181cafeabc739 Mon 
Sep 17 00:00:00 2001 From: Eta Date: Sat, 1 Feb 2025 12:34:23 -0600 Subject: [PATCH 81/94] ci(torch): Increase all job timeouts [skip ci] --- .github/workflows/build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 603b0b97..3adc93e5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -35,6 +35,7 @@ jobs: name: Build Images runs-on: [ cw ] container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0' + timeout-minutes: 960 defaults: run: shell: bash From fb567b854d07c605b654d3afd18e5178bb62e4e0 Mon Sep 17 00:00:00 2001 From: Eta Date: Sat, 1 Feb 2025 12:34:57 -0600 Subject: [PATCH 82/94] ci(torch): Remove `torch`-specific job timeout override --- .github/workflows/torch.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml index 4b27af68..6538dff6 100644 --- a/.github/workflows/torch.yml +++ b/.github/workflows/torch.yml @@ -82,7 +82,6 @@ on: jobs: build: - timeout-minutes: 960 name: Build torch uses: ./.github/workflows/build.yml secrets: inherit From 0bd89968d09542ab328ea3e0ab54962fdd6aa223 Mon Sep 17 00:00:00 2001 From: Eta Date: Sun, 2 Feb 2025 21:29:06 -0600 Subject: [PATCH 83/94] build(torch-extras): Specify DeepSpeed build flags better [skip ci] --- torch-extras/Dockerfile | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 415608b4..55e58fa7 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -128,25 +128,25 @@ RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=d # DeepSpeed build flags # See: https://www.deepspeed.ai/tutorials/advanced-install -ARG DS_BUILD_OPS="1" +ARG DS_BUILD_OPS="0" ARG DS_BUILD_CCL_COMM="0" -ARG DS_BUILD_CPU_ADAM="" -ARG DS_BUILD_CPU_LION="" +ARG DS_BUILD_CPU_ADAM="1" +ARG DS_BUILD_CPU_LION="1" # Requires CUTLASS ARG 
DS_BUILD_EVOFORMER_ATTN="0" -ARG DS_BUILD_FUSED_ADAM="" -ARG DS_BUILD_FUSED_LION="" -ARG DS_BUILD_CPU_ADAGRAD="" -ARG DS_BUILD_FUSED_LAMB="" -ARG DS_BUILD_QUANTIZER="" -ARG DS_BUILD_RANDOM_LTD="" +ARG DS_BUILD_FUSED_ADAM="1" +ARG DS_BUILD_FUSED_LION="1" +ARG DS_BUILD_CPU_ADAGRAD="1" +ARG DS_BUILD_FUSED_LAMB="1" +ARG DS_BUILD_QUANTIZER="1" +ARG DS_BUILD_RANDOM_LTD="1" # sparse_attn has issues with PyTorch >= 2.0.0 as of DeepSpeed 0.9.4 ARG DS_BUILD_SPARSE_ATTN="0" -ARG DS_BUILD_TRANSFORMER="" -ARG DS_BUILD_TRANSFORMER_INFERENCE="" -ARG DS_BUILD_STOCHASTIC_TRANSFORMER="" -ARG DS_BUILD_UTILS="" -ARG DS_BUILD_AIO="" +ARG DS_BUILD_TRANSFORMER="1" +ARG DS_BUILD_TRANSFORMER_INFERENCE="1" +ARG DS_BUILD_STOCHASTIC_TRANSFORMER="1" +ARG DS_BUILD_UTILS="1" +ARG DS_BUILD_AIO="1" ARG DEEPSPEED_VERSION @@ -185,8 +185,9 @@ RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ } && \ CC=$(realpath -e ./compiler) \ MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)}" \ + DS_ACCELERATOR='cuda' \ python3 -m pip wheel -w /wheels \ - --no-cache-dir --no-build-isolation --no-deps \ + --no-cache-dir --no-build-isolation --no-deps -v \ deepspeed==${DEEPSPEED_VERSION} && \ rm ./* SHELL ["/bin/sh", "-c"] From 386fabe4d6e3af40fe9fc6c72e62d2c4af0545bc Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 3 Feb 2025 01:36:39 -0600 Subject: [PATCH 84/94] build(torch-extras): Remove `DS_ACCELERATOR` specification [skip ci] --- torch-extras/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 55e58fa7..fbd5c29f 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -185,7 +185,6 @@ RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ } && \ CC=$(realpath -e ./compiler) \ MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)}" \ - DS_ACCELERATOR='cuda' \ python3 -m pip wheel -w /wheels \ --no-cache-dir --no-build-isolation --no-deps -v \ deepspeed==${DEEPSPEED_VERSION} 
&& \ From 45dd5a0a956f9032e16ea63ee7516a93de2f1151 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 7 Feb 2025 00:45:35 -0600 Subject: [PATCH 85/94] build(torch): Enable less-hacky 10.0 arch support in PyTorch --- torch/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 4f5862d0..421b2e16 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -10,8 +10,8 @@ ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1" ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1" ARG BUILD_TRITON_VERSION="" ARG BUILD_TRITON="1" -ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX" -ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90" +ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0 10.0+PTX" +ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90;100" # 8.7 is supported in the PyTorch main branch, but not 2.0.0 From 90d178ba5ed7003f361963f891847247163a6bf7 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 7 Feb 2025 00:56:57 -0600 Subject: [PATCH 86/94] feat(sglang): Add `sglang` image --- .github/workflows/sglang.yml | 30 +++++++ sglang/Dockerfile | 29 +++++++ sglang/build.bash | 149 +++++++++++++++++++++++++++++++++++ sglang/install.bash | 32 ++++++++ 4 files changed, 240 insertions(+) create mode 100644 .github/workflows/sglang.yml create mode 100644 sglang/Dockerfile create mode 100644 sglang/build.bash create mode 100644 sglang/install.bash diff --git a/.github/workflows/sglang.yml b/.github/workflows/sglang.yml new file mode 100644 index 00000000..a851ecba --- /dev/null +++ b/.github/workflows/sglang.yml @@ -0,0 +1,30 @@ +on: + workflow_dispatch: + inputs: + tag: + description: 'Tag for the build' + required: true + base-image: + description: 'Base image from which to build' + required: true + builder-image: + description: 'Image to use to compile wheels, if different from the base image' + required: false + push: + paths: + - "sglang/**" + - ".github/workflows/sglang.yml" + - 
".github/workflows/build.yml" + + +jobs: + build: + uses: ./.github/workflows/build.yml + secrets: inherit + with: + image-name: sglang + folder: sglang + tag-suffix: ${{ inputs.tag || '386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1' }} + build-args: | + BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:es-actions-386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1'}} + ${{ inputs.base-image && 'BASE_IMAGE=' }}${{ inputs.base-image}} diff --git a/sglang/Dockerfile b/sglang/Dockerfile new file mode 100644 index 00000000..9adb87b0 --- /dev/null +++ b/sglang/Dockerfile @@ -0,0 +1,29 @@ +# syntax=docker/dockerfile:1.2 +ARG BASE_IMAGE +ARG BUILDER_IMAGE="${BASE_IMAGE}" + +FROM ${BUILDER_IMAGE} AS builder + +ARG BUILD_TORCH_CUDA_ARCH_LIST='8.0 8.6 8.9 9.0 10.0+PTX' + +ARG FLASHINFER_COMMIT='c04755e21f4d6fb7813c703f2b00a7ef012be9b8' +ARG CUTLASS_COMMIT='b78588d1630aa6643bf021613717bafb705df4ef' +ARG VLLM_COMMIT='5095e966069b9e65b7c4c63427e06cebacaad0a0' +ARG SGLANG_COMMIT='4b6f62e2bc52a528551e9a21e7b0a4945c6115bb' +ARG DECORD_COMMIT='d2e56190286ae394032a8141885f76d5372bd44b' +# Building Triton is not currently enabled, +# but this is the commit that would be used if it were +ARG TRITON_COMMIT='1e0e51c4aeb3e1beea000da5d0e494f8b9ac40dd' + +WORKDIR /build +COPY build.bash /build/ +RUN mkdir /wheels && \ + bash build.bash -a "${BUILD_TORCH_CUDA_ARCH_LIST}" && \ + rm -rf /build/* +COPY install.bash /wheels/ + +FROM ${BASE_IMAGE} +RUN --mount=type=bind,from=builder,source=/wheels,target=/wheels \ + cd /wheels && \ + bash install.bash +RUN rmdir /wheels diff --git a/sglang/build.bash b/sglang/build.bash new file mode 100644 index 00000000..c0e0a7ad --- /dev/null +++ b/sglang/build.bash @@ -0,0 +1,149 @@ +#!/bin/bash +set -xeo pipefail + +TORCH_CUDA_ARCH_LIST='' +FILTER_ARCHES='' +BUILD_TRITON='' + +while getopts 'a:ft' OPT; do + case "${OPT}" in + a) 
TORCH_CUDA_ARCH_LIST="${OPTARG}" ;; + f) FILTER_ARCHES='1' ;; + t) BUILD_TRITON='1' ;; + *) exit 92 ;; + esac +done + +export NVCC_APPEND_FLAGS='-gencode=arch=compute_100,code=[sm_100,compute_100] -gencode=arch=compute_100a,code=sm_100a --diag-suppress 174' +export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0 10.0+PTX}" + +mkdir -p /wheels/logs + +_BUILD() { python3 -m build -w -n -v -o /wheels "${1:-.}"; } +_LOG() { tee -a "/wheels/logs/${1:?}"; } +_CONSTRAINTS="$(python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p')" +_PIP_INSTALL() { + python3 -m pip install --no-cache-dir \ + --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \ + "$@" +} + +_PIP_INSTALL -U pip setuptools wheel build pybind11 ninja cmake + +# triton (not compatible with torch 2.6) +if [ "${BUILD_TRITON}" = 1 ]; then ( + : "${TRITON_COMMIT:?}" + echo 'Building triton-lang/triton' + git clone --recursive --filter=blob:none https://github.com/triton-lang/triton + cd triton + git checkout "${TRITON_COMMIT}" + _BUILD python |& _LOG triton.log +); fi + +# flashinfer +: "${FLASHINFER_COMMIT:?}" +: "${CUTLASS_COMMIT:?}" +( +echo 'Building flashinfer-ai/flashinfer' +git clone --recursive --filter=blob:none https://github.com/flashinfer-ai/flashinfer +cd flashinfer +git checkout "${FLASHINFER_COMMIT}" +sed -i 's/name = "flashinfer-python"/name = "flashinfer"/' pyproject.toml +git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}" +_PIP_INSTALL -U optree +NVCC_APPEND_FLAGS="${NVCC_APPEND_FLAGS:+$NVCC_APPEND_FLAGS } --diag-suppress 20281,174" \ + FLASHINFER_ENABLE_AOT=1 _BUILD . 
|& _LOG flashinfer.log +) + +# Setup cutlass repo for vLLM to use +git clone --recursive --filter=blob:none https://github.com/NVIDIA/cutlass +git -C cutlass checkout "${CUTLASS_COMMIT}" + +# vLLM +: "${VLLM_COMMIT:?}" +( +echo 'Building vllm-project/vllm' +export VLLM_CUTLASS_SRC_DIR="${PWD}/cutlass" +test -d "${VLLM_CUTLASS_SRC_DIR}" +git clone --recursive --filter=blob:none https://github.com/vllm-project/vllm +cd vllm +git checkout "${VLLM_COMMIT}" +# For lsmod +apt-get -qq update && apt-get -qq install kmod +python3 use_existing_torch.py +_PIP_INSTALL -r requirements-build.txt +_BUILD . |& _LOG vllm.log +) + +# sglang +: "${SGLANG_COMMIT:?}" +( +echo 'Building sglang' +git clone --recursive --filter=blob:none https://github.com/sgl-project/sglang +cd sglang +git checkout "${SGLANG_COMMIT}" +( +cd sgl-kernel +git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}" +git -C 3rdparty/flashinfer/3rdparty/cutlass checkout "${CUTLASS_COMMIT}" + +ARCH_TRIPLE="$(gcc -print-multiarch)" +LIB_DIR="/usr/lib/${ARCH_TRIPLE:?}" +test -d "${LIB_DIR:?}" +PYTHON_API_VER="$( + python3 --version | sed -En 's@Python ([0-9])\.([0-9]+)\..*@cp\1\2@p' +)" +ARCH_FILTER=() +if [ "${FILTER_ARCHES}" = 1 ]; then + ARCH_FILTER=(-e 's@"-gencode=arch=compute_[78][0-9],code=sm_[78][0-9]",@#\0@') +fi + +sed -Ei \ + "${ARCH_FILTER[@]}" \ + -e 's@/usr/lib/x86_64-linux-gnu@'"${LIB_DIR}"'@' \ + -e 's@(\s+)(\w.+manylinux2014_x86_64.+)@\1pass # \2@' \ + -e 's@\{"py_limited_api": "cp39"}@{"py_limited_api": "'"${PYTHON_API_VER:-cp310}"'"}@' \ + setup.py +SGL_KERNEL_ENABLE_BF16=1 SGL_KERNEL_ENABLE_FP8=1 SGL_KERNEL_ENABLE_SM90A=1 \ + _BUILD . |& _LOG sglang.log +) +_BUILD python |& _LOG sglang.log +) + +# decord and xgrammar aren't available on PyPI for ARM64 + +if [ ! "$(uname -m)" = 'x86_64' ]; then + # xgrammar (for sglang) + ( + git clone --recursive --filter=blob:none -b v0.1.11 https://github.com/mlc-ai/xgrammar && \ + cd xgrammar + ( + mkdir build && cd build + cmake -S.. -B. 
-DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG xgrammar.log + cmake --build . |& _LOG xgrammar.log + ) + _BUILD python |& _LOG xgrammar.log + ) + + # decord (for sglang) + : "${DECORD_COMMIT:?}" + ( + apt-get -qq update && apt-get -q install --no-install-recommends \ + build-essential python3-dev python3-setuptools \ + make cmake ffmpeg \ + libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev + git clone --recursive --filter=blob:none https://github.com/dmlc/decord + cd decord + git checkout "${DECORD_COMMIT}" + ( + mkdir build && cd build + cmake -S.. -B. -DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG decord.log + cmake --build . |& _LOG decord.log + cp libdecord.so /wheels/libdecord.so + ) + cd python + _BUILD . |& _LOG decord.log + ) +fi + +apt-get clean diff --git a/sglang/install.bash b/sglang/install.bash new file mode 100644 index 00000000..dfd39370 --- /dev/null +++ b/sglang/install.bash @@ -0,0 +1,32 @@ +#!/bin/bash +set -xeo pipefail + +_CONSTRAINTS="$( + python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p' +)" +_PIP_INSTALL() { + python3 -m pip install --no-cache-dir \ + --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \ + "$@" +} + +_PIP_INSTALL /wheels/*.whl +if [ -x /wheels/libdecord.so ]; then + apt-get -qq update && apt-get -q install --no-install-recommends \ + libavfilter7 libavformat58 && \ + apt-get clean + cp /wheels/libdecord.so /usr/local/lib/ && ldconfig +fi + +SGLANG_EXTRA_PIP_DEPENDENCIES=() +if [ "$(uname -m)" = 'x86_64' ]; then + SGLANG_EXTRA_PIP_DEPENDENCIES=('decord' 'xgrammar>=0.1.10') +fi +_PIP_INSTALL \ + 'aiohttp' 'fastapi' \ + 'hf_transfer' 'huggingface_hub' 'interegular' 'modelscope' \ + 'orjson' 'packaging' 'pillow' 'prometheus-client>=0.20.0' \ + 'psutil' 'pydantic' 'python-multipart' 'pyzmq>=25.1.2' \ + 'torchao>=0.7.0' 'uvicorn' 'uvloop' \ + 'cuda-python' 'outlines>=0.0.44,<0.1.0' \ + "${SGLANG_EXTRA_PIP_DEPENDENCIES[@]}" From 855d2f3d893e598b913bfc1ba124da6e8efc7dee Mon Sep 17 00:00:00 2001 
From: Eta Date: Fri, 7 Feb 2025 01:14:02 -0600 Subject: [PATCH 87/94] build(sglang): Use `USE_CUDNN` and `USE_CUSPARSELT` flags in vLLM build --- sglang/build.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sglang/build.bash b/sglang/build.bash index c0e0a7ad..e58ff0fd 100644 --- a/sglang/build.bash +++ b/sglang/build.bash @@ -72,7 +72,7 @@ git checkout "${VLLM_COMMIT}" apt-get -qq update && apt-get -qq install kmod python3 use_existing_torch.py _PIP_INSTALL -r requirements-build.txt -_BUILD . |& _LOG vllm.log +USE_CUDNN=1 USE_CUSPARSELT=1 _BUILD . |& _LOG vllm.log ) # sglang From dc70ca8e2ee57b4654e0779166c82bea7087e51b Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 7 Feb 2025 10:00:30 -0600 Subject: [PATCH 88/94] fix(sglang): Remove extraneous `rmdir` build step --- sglang/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/sglang/Dockerfile b/sglang/Dockerfile index 9adb87b0..2103ca20 100644 --- a/sglang/Dockerfile +++ b/sglang/Dockerfile @@ -26,4 +26,3 @@ FROM ${BASE_IMAGE} RUN --mount=type=bind,from=builder,source=/wheels,target=/wheels \ cd /wheels && \ bash install.bash -RUN rmdir /wheels From f113d38bcc90da52b6da06ae395b9f07f520fd79 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 7 Feb 2025 12:40:25 -0600 Subject: [PATCH 89/94] fix(sglang): Skip `apt` prompts --- sglang/build.bash | 5 +++-- sglang/install.bash | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sglang/build.bash b/sglang/build.bash index e58ff0fd..d72d7a37 100644 --- a/sglang/build.bash +++ b/sglang/build.bash @@ -1,5 +1,6 @@ #!/bin/bash set -xeo pipefail +export DEBIAN_FRONTEND=noninteractive TORCH_CUDA_ARCH_LIST='' FILTER_ARCHES='' @@ -69,7 +70,7 @@ git clone --recursive --filter=blob:none https://github.com/vllm-project/vllm cd vllm git checkout "${VLLM_COMMIT}" # For lsmod -apt-get -qq update && apt-get -qq install kmod +apt-get -qq update && apt-get -qq install --no-install-recommends -y kmod python3 use_existing_torch.py _PIP_INSTALL -r 
requirements-build.txt USE_CUDNN=1 USE_CUSPARSELT=1 _BUILD . |& _LOG vllm.log @@ -128,7 +129,7 @@ if [ ! "$(uname -m)" = 'x86_64' ]; then # decord (for sglang) : "${DECORD_COMMIT:?}" ( - apt-get -qq update && apt-get -q install --no-install-recommends \ + apt-get -qq update && apt-get -q install --no-install-recommends -y \ build-essential python3-dev python3-setuptools \ make cmake ffmpeg \ libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev diff --git a/sglang/install.bash b/sglang/install.bash index dfd39370..07c23b6b 100644 --- a/sglang/install.bash +++ b/sglang/install.bash @@ -1,5 +1,6 @@ #!/bin/bash set -xeo pipefail +export DEBIAN_FRONTEND=noninteractive _CONSTRAINTS="$( python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p' @@ -12,7 +13,7 @@ _PIP_INSTALL() { _PIP_INSTALL /wheels/*.whl if [ -x /wheels/libdecord.so ]; then - apt-get -qq update && apt-get -q install --no-install-recommends \ + apt-get -qq update && apt-get -q install --no-install-recommends -y \ libavfilter7 libavformat58 && \ apt-get clean cp /wheels/libdecord.so /usr/local/lib/ && ldconfig From f63ddbebf21c2a658482bfca96ce68cc3d101687 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 14 Feb 2025 14:30:40 -0600 Subject: [PATCH 90/94] ci(torch-nightly): Update runner image version --- .github/workflows/torch-nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index fb51869f..4693b963 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -20,7 +20,7 @@ jobs: name: Get Nightly Info runs-on: [ cw ] - container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0' + container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0' defaults: run: shell: bash From 868a61122f8c5b6c8e0d063ee568a5fa8b7d9808 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 14 Feb 2025 14:31:11 -0600 Subject: [PATCH 91/94] ci: 
Parameterize build platforms [skip ci] --- .github/workflows/build.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3adc93e5..83707c88 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,6 +19,11 @@ on: required: false description: "Optional sub-key to append to the image name for build layer caching" type: string + platforms: + required: false + description: "Platforms for which to build (default: linux/amd64,linux/arm64)" + type: string + default: linux/amd64,linux/arm64 outputs: outcome: description: "The outcome of the build" @@ -110,7 +115,7 @@ jobs: labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }} cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max - platforms: linux/amd64,linux/arm64 + platforms: ${{ inputs.platforms }} - name: Clear registry credentials if: always() run: | From 0e4411609d924aeddf1a997020191b1f689623d1 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 14 Feb 2025 14:34:38 -0600 Subject: [PATCH 92/94] fix(torch): Filter 10.0 arch builds on unsupported CUDA versions again The previous method didn't work when 10.0 was included in the BUILD_TORCH_CUDA_ARCH_LIST build argument, so this uses shell parameter expansion hackery to get around that. This also keeps the previous logic, but switches it to force sm_100a builds on supported CUDA versions. --- torch-extras/Dockerfile | 2 +- torch/Dockerfile | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index fbd5c29f..51346e0b 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -108,7 +108,7 @@ COPY --chmod=755 scale.sh . 
ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a" RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \ case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \ - FLAGS="${FLAGS} -gencode=arch=compute_100,code=[sm_100,compute_100]" ;; \ + FLAGS="${FLAGS} -gencode=arch=compute_100a,code=sm_100a" ;; \ esac && \ echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf ARG BUILD_MAX_JOBS diff --git a/torch/Dockerfile b/torch/Dockerfile index 421b2e16..5db5c4aa 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -1,4 +1,4 @@ -# syntax=docker/dockerfile:1.4 +# syntax=docker/dockerfile:1.7 ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.8.0-devel-ubuntu22.04" ARG FINAL_BASE_IMAGE="nvidia/cuda:12.8.0-base-ubuntu22.04" @@ -330,13 +330,17 @@ RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,r ARG BUILD_TORCH_VERSION ARG BUILD_TORCH_CUDA_ARCH_LIST ENV TORCH_VERSION=$BUILD_TORCH_VERSION -ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST +# Filter out the 10.0 arch on CUDA versions != 12.8 +ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${BUILD_TORCH_CUDA_ARCH_LIST/ 10.0/}||${BUILD_TORCH_CUDA_ARCH_LIST}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}" ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a" -# Add compute_100 build if NV_CUDA_LIB_VERSION matches 12.[89].* +# Add sm_100a build if NV_CUDA_LIB_VERSION matches 12.[89].* RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \ case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \ - FLAGS="${FLAGS} -gencode=arch=compute_100,code=[sm_100,compute_100]" ;; \ + FLAGS="${FLAGS} -gencode=arch=compute_100a,code=sm_100a" ;; \ esac && \ echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf @@ -629,7 +633,11 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST ENV TORCH_VERSION=$BUILD_TORCH_VERSION ENV 
TORCH_VISION_VERSION=$BUILD_TORCH_VISION_VERSION ENV TORCH_AUDIO_VERSION=$BUILD_TORCH_AUDIO_VERSION -ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST +# Filter out the 10.0 arch on CUDA versions != 12.8 +ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${BUILD_TORCH_CUDA_ARCH_LIST/ 10.0/}||${BUILD_TORCH_CUDA_ARCH_LIST}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}" COPY --link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh # - libnvjitlink-X-Y only exists for CUDA versions >= 12-0. From 2f37df6dfa8a8aab07e18eb2ef5b54e40267a1b5 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 14 Feb 2025 15:37:20 -0600 Subject: [PATCH 93/94] fix(torch): Filter 10.0 arch builds in TransformerEngine build --- torch/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch/Dockerfile b/torch/Dockerfile index 5db5c4aa..97f0f759 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -513,6 +513,9 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE echo "MAX_JOBS: ${MAX_JOBS}" && \ export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ + case "${CUDA_VERSION}" in 12.[0123456].*) \ + export NVTE_CUDA_ARCHS="${NVTE_CUDA_ARCHS%;100*}" ;; \ + esac && \ cd TransformerEngine && \ if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \ sed -i "s/from functools import cache/from functools import lru_cache as cache/g" \ From 68fbfd11691200e55d920709b9a4802b639b0c29 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 14 Feb 2025 16:10:12 -0600 Subject: [PATCH 94/94] ci(torch): Rework logic for passing various build arguments --- .github/workflows/torch-base.yml | 2 +- .github/workflows/torch-nccl.yml | 2 +- .github/workflows/torch-nightly.yml | 8 ++++---- .github/workflows/torch.yml | 17 ++++------------- torch/Dockerfile | 2 +- 5 files changed, 11 insertions(+), 
20 deletions(-) diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml index a6fc2fb1..b93fbbae 100644 --- a/.github/workflows/torch-base.yml +++ b/.github/workflows/torch-base.yml @@ -41,6 +41,6 @@ jobs: torch-version: ${{ matrix.torch }} torchvision-version: ${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} - cxx11-abi: ${{ matrix.abi }} + additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }} cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml index feae8f7b..ede0fdf0 100644 --- a/.github/workflows/torch-nccl.yml +++ b/.github/workflows/torch-nccl.yml @@ -49,6 +49,6 @@ jobs: torch-version: ${{ matrix.torch }} torchvision-version: ${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} - cxx11-abi: ${{ matrix.abi }} + additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }} cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 4693b963..139d23d9 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -93,13 +93,13 @@ jobs: uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-base.yml - filter: 'del(.include) | .exclude |= . + [{"os": "ubuntu20.04"}]' + filter: 'del(.include) | .exclude |= . + [{"abi": "0"}]' get-nccl-config: name: Get torch:nccl Config uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-nccl.yml - filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"os": "ubuntu20.04"}]' + filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . 
+ [{"abi": "0"}]' build-base: name: Build Nightly torch:base @@ -119,7 +119,7 @@ jobs: torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} - triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }} + additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }} cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true build-nccl: @@ -140,6 +140,6 @@ jobs: torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} - triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }} + additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }} cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml index 6538dff6..938b4306 100644 --- a/.github/workflows/torch.yml +++ b/.github/workflows/torch.yml @@ -19,10 +19,7 @@ on: torchaudio-version: required: true type: string - triton-version: - required: false - type: string - cxx11-abi: + additional-build-args: required: false type: string image-name: @@ -62,13 +59,9 @@ on: required: true description: "Tagged version number from pytorch/audio to build" type: string - triton-version: - required: false - description: "Tagged version number from openai/triton to build" - type: string - cxx11-abi: + additional-build-args: required: false - description: "Build with the CXX11 ABI (1 = enable, 0 = disable)" + description: "Further --build-arg parameters for the build" type: string image-name: required: false @@ -97,9 +90,7 @@ jobs: BUILD_TORCH_VERSION=${{ inputs.torch-version }} BUILD_TORCH_VISION_VERSION=${{ 
inputs.torchvision-version }} BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }} - BUILD_TORCH_CUDA_ARCH_LIST=7.0 7.5 8.0 8.6 8.9 9.0+PTX - ${{ inputs.cxx11-abi && format('BUILD_CXX11_ABI={0}', inputs.cxx11-abi) || '' }} - ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }} + ${{ inputs.additional-build-args }} build-extras: name: Build torch-extras if: inputs.build-extras diff --git a/torch/Dockerfile b/torch/Dockerfile index 97f0f759..e070232a 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -10,7 +10,7 @@ ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1" ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1" ARG BUILD_TRITON_VERSION="" ARG BUILD_TRITON="1" -ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0 10.0+PTX" +ARG BUILD_TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0 10.0+PTX" ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90;100" # 8.7 is supported in the PyTorch main branch, but not 2.0.0