diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml index 761f2b6b..f75b79a5 100644 --- a/.github/configurations/torch-base.yml +++ b/.github/configurations/torch-base.yml @@ -1,6 +1,7 @@ -cuda: [ 12.6.1, 12.4.1, 12.2.2 ] -os: [ ubuntu22.04, ubuntu20.04 ] +cuda: [ 12.8.0, 12.6.3, 12.4.1 ] +os: [ ubuntu22.04 ] +abi: [ 1, 0 ] include: - - torch: 2.5.0 - vision: 0.20.0 - audio: 2.5.0 + - torch: 2.6.0 + vision: 0.21.0 + audio: 2.6.0 diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 6bd5f029..ec1f1f91 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,37 +1,9 @@ -image: - # Ubuntu 22.04 - - cuda: 12.6.1 - cudnn: cudnn - os: ubuntu22.04 - nccl: 2.23.4-1 - nccl-tests-hash: 2ff05b2 - - cuda: 12.4.1 - cudnn: cudnn - os: ubuntu22.04 - nccl: 2.23.4-1 - nccl-tests-hash: 2ff05b2 - - cuda: 12.2.2 - cudnn: cudnn8 - os: ubuntu22.04 - nccl: 2.23.4-1 - nccl-tests-hash: 2ff05b2 - # Ubuntu 20.04 - - cuda: 12.6.1 - cudnn: cudnn - os: ubuntu20.04 - nccl: 2.23.4-1 - nccl-tests-hash: 2ff05b2 - - cuda: 12.4.1 - cudnn: cudnn - os: ubuntu20.04 - nccl: 2.23.4-1 - nccl-tests-hash: 2ff05b2 - - cuda: 12.2.2 - cudnn: cudnn8 - os: ubuntu20.04 - nccl: 2.21.5-1 - nccl-tests-hash: 2ff05b2 +cuda: [ 12.8.0, 12.6.3, 12.4.1 ] +os: [ ubuntu22.04 ] +abi: [ 1, 0 ] include: - - torch: 2.5.0 - vision: 0.20.0 - audio: 2.5.0 + - torch: 2.6.0 + vision: 0.21.0 + audio: 2.6.0 + nccl: 2.25.1-1 + nccl-tests-hash: 57fa979 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cbb09fdc..83707c88 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,6 +19,11 @@ on: required: false description: "Optional sub-key to append to the image name for build layer caching" type: string + platforms: + required: false + description: "Platforms for which to build (default: linux/amd64,linux/arm64)" + type: string + default: linux/amd64,linux/arm64 
outputs: outcome: description: "The outcome of the build" @@ -33,26 +38,42 @@ on: jobs: build: name: Build Images - runs-on: [ self-hosted, Linux ] + runs-on: [ cw ] + container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0' + timeout-minutes: 960 + defaults: + run: + shell: bash outputs: outcome: ${{ steps.docker-build.outcome }} tags: ${{ steps.meta.outputs.tags }} version: ${{ steps.meta.outputs.version }} steps: - - uses: actions/checkout@v3 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2.2.1 - - name: Login to GitHub container registry - uses: docker/login-action@v2.2.0 + - uses: actions/checkout@v4 + - name: Fetch BuildKit Client Certs + uses: dopplerhq/secrets-fetch-action@v1.2.0 + id: client-certs with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Login to DockerHub container registry - uses: docker/login-action@v2.2.0 + doppler-token: ${{ secrets.ORG_BUILDKIT_CLIENT_TOKEN }} + doppler-project: ${{ secrets.BUILDKIT_CONSUMER_DOPPLER_PROJECT }} + doppler-config: prod + inject-env-vars: false + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3.7.1 with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} + driver: remote + endpoint: ${{ secrets.BUILDKIT_CONSUMER_AMD64_ENDPOINT }} + platforms: linux/amd64 + append: | + - endpoint: ${{ secrets.BUILDKIT_CONSUMER_ARM64_ENDPOINT }} + platforms: linux/arm64 + env: + BUILDER_NODE_0_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }} + BUILDER_NODE_0_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }} + BUILDER_NODE_0_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }} + BUILDER_NODE_1_AUTH_TLS_CACERT: ${{ steps.client-certs.outputs.TLS_CACERT }} + BUILDER_NODE_1_AUTH_TLS_CERT: ${{ steps.client-certs.outputs.TLS_CERT }} + BUILDER_NODE_1_AUTH_TLS_KEY: ${{ steps.client-certs.outputs.TLS_KEY }} - name: Get base registry run: | echo 
"REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV @@ -70,14 +91,21 @@ jobs: echo "CACHE_KEY=${{ inputs.image-name }}-${{ inputs.cache-key }}" >> $GITHUB_ENV - name: Extract metadata (tags, labels) for Docker id: meta - uses: docker/metadata-action@v4.1.1 + uses: docker/metadata-action@v5.5.1 with: images: ${{ env.REGISTRY }}/${{ inputs.image-name }} tags: | type=sha,prefix=${{ env.TAG_PREFIX }},suffix=${{ env.TAG_SUFFIX }},format=short + - name: Initialize registry credentials file + env: + USER: ${{ github.actor }} + PASS: ${{ secrets.GITHUB_TOKEN }} + run: | + jq -n '.auths."ghcr.io" = { username: env.USER, password: env.PASS }' \ + | install -m400 /dev/stdin ~/.docker/config.json - name: Build and push Docker image id: docker-build - uses: docker/build-push-action@v3.2.0 + uses: docker/build-push-action@v6.9.0 with: context: ${{ inputs.folder }} build-args: |- @@ -87,6 +115,11 @@ jobs: labels: ${{ steps.meta.outputs.labels }} cache-from: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }} cache-to: type=registry,ref=${{ env.REGISTRY }}/buildcache:${{ env.CACHE_KEY || inputs.image-name }},mode=max + platforms: ${{ inputs.platforms }} + - name: Clear registry credentials + if: always() + run: | + rm -f ~/.docker/config.json && [ ! 
-e ~/.docker/config.json ] - uses: 8BitJonny/gh-get-current-pr@2.1.3 id: PR with: diff --git a/.github/workflows/read-configuration.yml b/.github/workflows/read-configuration.yml index 12a21b31..25f5de3e 100644 --- a/.github/workflows/read-configuration.yml +++ b/.github/workflows/read-configuration.yml @@ -17,12 +17,16 @@ on: jobs: read-file: name: Read Configuration File - runs-on: ["self-hosted", "Linux"] + runs-on: [ cw ] + container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0' + defaults: + run: + shell: bash permissions: {} outputs: config: ${{ steps.read.outputs.contents }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Read configuration id: read env: diff --git a/.github/workflows/sglang.yml b/.github/workflows/sglang.yml new file mode 100644 index 00000000..a851ecba --- /dev/null +++ b/.github/workflows/sglang.yml @@ -0,0 +1,30 @@ +on: + workflow_dispatch: + inputs: + tag: + description: 'Tag for the build' + required: true + base-image: + description: 'Base image from which to build' + required: true + builder-image: + description: 'Image to use to compile wheels, if different from the base image' + required: false + push: + paths: + - "sglang/**" + - ".github/workflows/sglang.yml" + - ".github/workflows/build.yml" + + +jobs: + build: + uses: ./.github/workflows/build.yml + secrets: inherit + with: + image-name: sglang + folder: sglang + tag-suffix: ${{ inputs.tag || '386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1' }} + build-args: | + BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:es-actions-386fabe-nccl-cuda12.8.0-ubuntu22.04-nccl2.25.1-1-torch2.6.0-vision0.21.0-audio2.6.0-abi1'}} + ${{ inputs.builder-image && 'BUILDER_IMAGE=' }}${{ inputs.builder-image }} diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml index 93148a65..b93fbbae 100644 --- a/.github/workflows/torch-base.yml +++ 
b/.github/workflows/torch-base.yml @@ -35,11 +35,12 @@ jobs: secrets: inherit with: image-name: ${{ inputs.image-name }} - tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }} + tag: ${{ format('{0}-{1}', format('base-cuda{0}-{1}', matrix.cuda, matrix.os), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }} builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-${{ matrix.os }} base-image: nvidia/cuda:${{ matrix.cuda }}-base-${{ matrix.os }} torch-version: ${{ matrix.torch }} torchvision-version: ${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} + additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }} cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index ca7134ed..e37a6c18 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -51,13 +51,17 @@ jobs: get-required-bases: name: Get Latest Required Base Images if: inputs.skip-bases-check != true - runs-on: ["self-hosted", "Linux"] + runs-on: [ cw ] + container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.4.0' + defaults: + run: + shell: bash permissions: packages: read outputs: bases-list: ${{ steps.choose-bases.outputs.list }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Check if torch-extras needs to be rebuilt from previous bases diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml index aee13052..ede0fdf0 100644 --- a/.github/workflows/torch-nccl.yml +++ b/.github/workflows/torch-nccl.yml @@ -43,11 +43,12 @@ jobs: secrets: inherit with: image-name: ${{ inputs.image-name }} - tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', 
matrix.image.cuda, matrix.image.os, matrix.image.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }} - builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} - base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-{1}-nccl{2}', matrix.cuda, matrix.os, matrix.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}-abi{3}', matrix.torch, matrix.vision, matrix.audio, matrix.abi)) }} + builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} + base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} torch-version: ${{ matrix.torch }} torchvision-version: ${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} - cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }} + additional-build-args: BUILD_CXX11_ABI=${{ matrix.abi }} + cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 063e40af..139d23d9 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -19,7 +19,11 @@ jobs: get-nightly-info: name: Get Nightly Info - runs-on: [ self-hosted, Linux ] + runs-on: [ cw ] + container: 'ghcr.io/coreweave/github-actions-images/github-base-runner:v1.9.0' + defaults: + run: + shell: bash outputs: pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }} triton-commit: ${{ steps.get-hash.outputs.triton-commit }} @@ -89,13 +93,13 @@ jobs: uses: 
./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-base.yml - filter: del(.include) + filter: 'del(.include) | .exclude |= . + [{"abi": "0"}]' get-nccl-config: name: Get torch:nccl Config uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-nccl.yml - filter: del(.include) + filter: 'del( .include[] | ( .torch, .vision, .audio ) ) | .exclude |= . + [{"abi": "0"}]' build-base: name: Build Nightly torch:base @@ -115,7 +119,7 @@ jobs: torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} - triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }} + additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }} cache-key: base-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true build-nccl: @@ -130,12 +134,12 @@ jobs: secrets: inherit with: image-name: nightly-torch - tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.os, matrix.image.nccl, needs.get-nightly-info.outputs.version-string ) }} - builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} - base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-${{ matrix.image.cudnn }}-devel-${{ matrix.image.os }}-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + tag: ${{ format('nccl-{0}-cuda{1}-{2}-nccl{3}-{4}', needs.get-nightly-info.outputs.date, matrix.cuda, matrix.os, matrix.nccl, needs.get-nightly-info.outputs.version-string ) }} + builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} + base-image: 
ghcr.io/coreweave/nccl-tests:${{ matrix.cuda }}-devel-${{ matrix.os }}-nccl${{ matrix.nccl }}-${{ matrix.nccl-tests-hash }} torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} - triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }} - cache-key: nccl-cuda${{ matrix.image.cuda }}-${{ matrix.image.os }} + additional-build-args: BUILD_TRITON_VERSION=${{ needs.get-nightly-info.outputs.triton-commit }} + cache-key: nccl-cuda${{ matrix.cuda }}-${{ matrix.os }} build-extras: true diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml index 36bdcc6d..938b4306 100644 --- a/.github/workflows/torch.yml +++ b/.github/workflows/torch.yml @@ -19,13 +19,9 @@ on: torchaudio-version: required: true type: string - triton-version: + additional-build-args: required: false type: string - cuda-arch-support: - required: false - type: string - default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX" image-name: required: false type: string @@ -63,15 +59,10 @@ on: required: true description: "Tagged version number from pytorch/audio to build" type: string - triton-version: - required: false - description: "Tagged version number from openai/triton to build" - type: string - cuda-arch-support: + additional-build-args: required: false - description: "Space-separated list of CUDA architectures to support" + description: "Further --build-arg parameters for the build" type: string - default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX" image-name: required: false description: "Custom name under which to publish the resulting container" @@ -99,8 +90,7 @@ jobs: BUILD_TORCH_VERSION=${{ inputs.torch-version }} BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }} BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }} - ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }} 
- ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }} + ${{ inputs.additional-build-args }} build-extras: name: Build torch-extras if: inputs.build-extras diff --git a/sglang/Dockerfile b/sglang/Dockerfile new file mode 100644 index 00000000..2103ca20 --- /dev/null +++ b/sglang/Dockerfile @@ -0,0 +1,28 @@ +# syntax=docker/dockerfile:1.2 +ARG BASE_IMAGE +ARG BUILDER_IMAGE="${BASE_IMAGE}" + +FROM ${BUILDER_IMAGE} AS builder + +ARG BUILD_TORCH_CUDA_ARCH_LIST='8.0 8.6 8.9 9.0 10.0+PTX' + +ARG FLASHINFER_COMMIT='c04755e21f4d6fb7813c703f2b00a7ef012be9b8' +ARG CUTLASS_COMMIT='b78588d1630aa6643bf021613717bafb705df4ef' +ARG VLLM_COMMIT='5095e966069b9e65b7c4c63427e06cebacaad0a0' +ARG SGLANG_COMMIT='4b6f62e2bc52a528551e9a21e7b0a4945c6115bb' +ARG DECORD_COMMIT='d2e56190286ae394032a8141885f76d5372bd44b' +# Building Triton is not currently enabled, +# but this is the commit that would be used if it were +ARG TRITON_COMMIT='1e0e51c4aeb3e1beea000da5d0e494f8b9ac40dd' + +WORKDIR /build +COPY build.bash /build/ +RUN mkdir /wheels && \ + bash build.bash -a "${BUILD_TORCH_CUDA_ARCH_LIST}" && \ + rm -rf /build/* +COPY install.bash /wheels/ + +FROM ${BASE_IMAGE} +RUN --mount=type=bind,from=builder,source=/wheels,target=/wheels \ + cd /wheels && \ + bash install.bash diff --git a/sglang/build.bash b/sglang/build.bash new file mode 100644 index 00000000..d72d7a37 --- /dev/null +++ b/sglang/build.bash @@ -0,0 +1,150 @@ +#!/bin/bash +set -xeo pipefail +export DEBIAN_FRONTEND=noninteractive + +TORCH_CUDA_ARCH_LIST='' +FILTER_ARCHES='' +BUILD_TRITON='' + +while getopts 'a:ft' OPT; do + case "${OPT}" in + a) TORCH_CUDA_ARCH_LIST="${OPTARG}" ;; + f) FILTER_ARCHES='1' ;; + t) BUILD_TRITON='1' ;; + *) exit 92 ;; + esac +done + +export NVCC_APPEND_FLAGS='-gencode=arch=compute_100,code=[sm_100,compute_100] -gencode=arch=compute_100a,code=sm_100a --diag-suppress 174' +export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0 10.0+PTX}" + +mkdir -p 
/wheels/logs + +_BUILD() { python3 -m build -w -n -v -o /wheels "${1:-.}"; } +_LOG() { tee -a "/wheels/logs/${1:?}"; } +_CONSTRAINTS="$(python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p')" +_PIP_INSTALL() { + python3 -m pip install --no-cache-dir \ + --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \ + "$@" +} + +_PIP_INSTALL -U pip setuptools wheel build pybind11 ninja cmake + +# triton (not compatible with torch 2.6) +if [ "${BUILD_TRITON}" = 1 ]; then ( + : "${TRITON_COMMIT:?}" + echo 'Building triton-lang/triton' + git clone --recursive --filter=blob:none https://github.com/triton-lang/triton + cd triton + git checkout "${TRITON_COMMIT}" + _BUILD python |& _LOG triton.log +); fi + +# flashinfer +: "${FLASHINFER_COMMIT:?}" +: "${CUTLASS_COMMIT:?}" +( +echo 'Building flashinfer-ai/flashinfer' +git clone --recursive --filter=blob:none https://github.com/flashinfer-ai/flashinfer +cd flashinfer +git checkout "${FLASHINFER_COMMIT}" +sed -i 's/name = "flashinfer-python"/name = "flashinfer"/' pyproject.toml +git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}" +_PIP_INSTALL -U optree +NVCC_APPEND_FLAGS="${NVCC_APPEND_FLAGS:+$NVCC_APPEND_FLAGS } --diag-suppress 20281,174" \ + FLASHINFER_ENABLE_AOT=1 _BUILD . |& _LOG flashinfer.log +) + +# Setup cutlass repo for vLLM to use +git clone --recursive --filter=blob:none https://github.com/NVIDIA/cutlass +git -C cutlass checkout "${CUTLASS_COMMIT}" + +# vLLM +: "${VLLM_COMMIT:?}" +( +echo 'Building vllm-project/vllm' +export VLLM_CUTLASS_SRC_DIR="${PWD}/cutlass" +test -d "${VLLM_CUTLASS_SRC_DIR}" +git clone --recursive --filter=blob:none https://github.com/vllm-project/vllm +cd vllm +git checkout "${VLLM_COMMIT}" +# For lsmod +apt-get -qq update && apt-get -qq install --no-install-recommends -y kmod +python3 use_existing_torch.py +_PIP_INSTALL -r requirements-build.txt +USE_CUDNN=1 USE_CUSPARSELT=1 _BUILD . 
|& _LOG vllm.log +) + +# sglang +: "${SGLANG_COMMIT:?}" +( +echo 'Building sglang' +git clone --recursive --filter=blob:none https://github.com/sgl-project/sglang +cd sglang +git checkout "${SGLANG_COMMIT}" +( +cd sgl-kernel +git -C 3rdparty/cutlass checkout "${CUTLASS_COMMIT}" +git -C 3rdparty/flashinfer/3rdparty/cutlass checkout "${CUTLASS_COMMIT}" + +ARCH_TRIPLE="$(gcc -print-multiarch)" +LIB_DIR="/usr/lib/${ARCH_TRIPLE:?}" +test -d "${LIB_DIR:?}" +PYTHON_API_VER="$( + python3 --version | sed -En 's@Python ([0-9])\.([0-9]+)\..*@cp\1\2@p' +)" +ARCH_FILTER=() +if [ "${FILTER_ARCHES}" = 1 ]; then + ARCH_FILTER=(-e 's@"-gencode=arch=compute_[78][0-9],code=sm_[78][0-9]",@#\0@') +fi + +sed -Ei \ + "${ARCH_FILTER[@]}" \ + -e 's@/usr/lib/x86_64-linux-gnu@'"${LIB_DIR}"'@' \ + -e 's@(\s+)(\w.+manylinux2014_x86_64.+)@\1pass # \2@' \ + -e 's@\{"py_limited_api": "cp39"}@{"py_limited_api": "'"${PYTHON_API_VER:-cp310}"'"}@' \ + setup.py +SGL_KERNEL_ENABLE_BF16=1 SGL_KERNEL_ENABLE_FP8=1 SGL_KERNEL_ENABLE_SM90A=1 \ + _BUILD . |& _LOG sglang.log +) +_BUILD python |& _LOG sglang.log +) + +# decord and xgrammar aren't available on PyPI for ARM64 + +if [ ! "$(uname -m)" = 'x86_64' ]; then + # xgrammar (for sglang) + ( + git clone --recursive --filter=blob:none -b v0.1.11 https://github.com/mlc-ai/xgrammar && \ + cd xgrammar + ( + mkdir build && cd build + cmake -S.. -B. -DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG xgrammar.log + cmake --build . |& _LOG xgrammar.log + ) + _BUILD python |& _LOG xgrammar.log + ) + + # decord (for sglang) + : "${DECORD_COMMIT:?}" + ( + apt-get -qq update && apt-get -q install --no-install-recommends -y \ + build-essential python3-dev python3-setuptools \ + make cmake ffmpeg \ + libavcodec-dev libavfilter-dev libavformat-dev libavutil-dev + git clone --recursive --filter=blob:none https://github.com/dmlc/decord + cd decord + git checkout "${DECORD_COMMIT}" + ( + mkdir build && cd build + cmake -S.. -B. 
-DUSE_CUDA=0 -DCMAKE_BUILD_TYPE=Release -GNinja |& _LOG decord.log + cmake --build . |& _LOG decord.log + cp libdecord.so /wheels/libdecord.so + ) + cd python + _BUILD . |& _LOG decord.log + ) +fi + +apt-get clean diff --git a/sglang/install.bash b/sglang/install.bash new file mode 100644 index 00000000..07c23b6b --- /dev/null +++ b/sglang/install.bash @@ -0,0 +1,33 @@ +#!/bin/bash +set -xeo pipefail +export DEBIAN_FRONTEND=noninteractive + +_CONSTRAINTS="$( + python3 -m pip list | sed -En 's@^(torch(vision|audio)?)\s+(\S+)$@\1==\3@p' +)" +_PIP_INSTALL() { + python3 -m pip install --no-cache-dir \ + --constraint=/dev/stdin <<< "${_CONSTRAINTS}" \ + "$@" +} + +_PIP_INSTALL /wheels/*.whl +if [ -x /wheels/libdecord.so ]; then + apt-get -qq update && apt-get -q install --no-install-recommends -y \ + libavfilter7 libavformat58 && \ + apt-get clean + cp /wheels/libdecord.so /usr/local/lib/ && ldconfig +fi + +SGLANG_EXTRA_PIP_DEPENDENCIES=() +if [ "$(uname -m)" = 'x86_64' ]; then + SGLANG_EXTRA_PIP_DEPENDENCIES=('decord' 'xgrammar>=0.1.10') +fi +_PIP_INSTALL \ + 'aiohttp' 'fastapi' \ + 'hf_transfer' 'huggingface_hub' 'interegular' 'modelscope' \ + 'orjson' 'packaging' 'pillow' 'prometheus-client>=0.20.0' \ + 'psutil' 'pydantic' 'python-multipart' 'pyzmq>=25.1.2' \ + 'torchao>=0.7.0' 'uvicorn' 'uvloop' \ + 'cuda-python' 'outlines>=0.0.44,<0.1.0' \ + "${SGLANG_EXTRA_PIP_DEPENDENCIES[@]}" diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 3c785f0f..51346e0b 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -2,8 +2,11 @@ ARG BASE_IMAGE ARG DEEPSPEED_VERSION="0.14.4" -ARG APEX_COMMIT="23c1f86520e22b505e8fdfcf6298273dff2d93d8" +ARG APEX_COMMIT="a1df80457ba67d60cbdb0d3ddfb08a2702c821a8" +ARG DEEPSPEED_KERNELS_COMMIT="e77acc40b104696d4e73229b787d1ef29a9685b1" +ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST="80;86;89;90" ARG XFORMERS_VERSION="0.0.28.post1" +ARG BUILD_MAX_JOBS="" FROM alpine/git:2.36.3 as apex-downloader WORKDIR /git @@ -16,6 +19,18 @@ 
RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \ --depth 1 --filter=blob:none && \ find -type d -name docs -prune -exec rm -r '{}' ';' + +FROM alpine/git:2.36.3 as ds-kernels-downloader +WORKDIR /git +ARG DEEPSPEED_KERNELS_COMMIT +RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \ + https://github.com/microsoft/DeepSpeed-Kernels ds-kernels && \ + cd ds-kernels && \ + git checkout "${DEEPSPEED_KERNELS_COMMIT}" && \ + git submodule update --init --recursive --jobs 8 \ + --depth 1 --filter=blob:none + + # Dependencies requiring NVCC are built ahead of time in a separate stage # so that the ~2 GiB dev library installations don't have to be included # in the final image. @@ -32,7 +47,6 @@ RUN export \ libcublas-dev-${CUDA_PACKAGE_VERSION} \ libcusparse-dev-${CUDA_PACKAGE_VERSION} \ libcusolver-dev-${CUDA_PACKAGE_VERSION} \ - cuda-nvprof-${CUDA_PACKAGE_VERSION} \ cuda-profiler-api-${CUDA_PACKAGE_VERSION} \ cuda-nvtx-${CUDA_PACKAGE_VERSION} \ cuda-nvrtc-dev-${CUDA_PACKAGE_VERSION} && \ @@ -58,18 +72,15 @@ RUN apt-get -qq update && apt-get -qq install -y \ # Update compiler (GCC) and linker (LLD) versions # gfortran-11 is just for compiler_wrapper.f95 -RUN CODENAME="$(lsb_release -cs)" && \ - wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ - apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \ - apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ +RUN LLVM_VERSION='18' && \ apt-get -qq update && apt-get -qq install --no-install-recommends -y \ - gcc-11 g++-11 gfortran-11 lld-17 && \ + gcc-11 g++-11 gfortran-11 "lld-$LLVM_VERSION" && \ apt-get clean && \ update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \ update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \ update-alternatives --install \ /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \ - update-alternatives --install 
/usr/bin/ld ld /usr/bin/ld.lld-17 1 + update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1 RUN mkdir /wheels /build WORKDIR /build @@ -80,40 +91,69 @@ WORKDIR /build # The compiler wrapper normalizes -march=native to -march=skylake # along with a couple other transformations before invoking GCC. COPY compiler_wrapper.f95 . -RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 +ARG AMD64_NATIVE_ARCH="skylake" +ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres" +RUN if [ "$(uname -m)" = "aarch64" ]; then \ + NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \ + AVX='WRAPPER_NO_AVX'; \ + else \ + NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \ + AVX='WRAPPER_AVX="AVX256"'; \ + fi && \ + gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 COPY --chmod=755 effective_cpu_count.sh . COPY --chmod=755 scale.sh . +ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=compute_90a" +RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \ + case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \ + FLAGS="${FLAGS} -gencode=arch=compute_100a,code=sm_100a" ;; \ + esac && \ + echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf +ARG BUILD_MAX_JOBS + FROM builder-base as deepspeed-builder + +ARG DEEPSPEED_KERNELS_CUDA_ARCH_LIST +RUN --mount=type=bind,from=ds-kernels-downloader,source=/git/ds-kernels,target=ds-kernels/,rw \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + cd ds-kernels && \ + export CUDA_ARCH_LIST="${DEEPSPEED_KERNELS_CUDA_ARCH_LIST}" && \ + echo "CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}" && \ + python3 -m pip wheel -w /wheels \ + --no-cache-dir --no-build-isolation --no-deps . 
&& \ + python3 -m pip install /wheels/*.whl + # DeepSpeed build flags # See: https://www.deepspeed.ai/tutorials/advanced-install -ARG DS_BUILD_OPS="1" +ARG DS_BUILD_OPS="0" ARG DS_BUILD_CCL_COMM="0" -ARG DS_BUILD_CPU_ADAM="" -ARG DS_BUILD_CPU_LION="" +ARG DS_BUILD_CPU_ADAM="1" +ARG DS_BUILD_CPU_LION="1" # Requires CUTLASS ARG DS_BUILD_EVOFORMER_ATTN="0" -ARG DS_BUILD_FUSED_ADAM="" -ARG DS_BUILD_FUSED_LION="" -ARG DS_BUILD_CPU_ADAGRAD="" -ARG DS_BUILD_FUSED_LAMB="" -ARG DS_BUILD_QUANTIZER="" -ARG DS_BUILD_RANDOM_LTD="" +ARG DS_BUILD_FUSED_ADAM="1" +ARG DS_BUILD_FUSED_LION="1" +ARG DS_BUILD_CPU_ADAGRAD="1" +ARG DS_BUILD_FUSED_LAMB="1" +ARG DS_BUILD_QUANTIZER="1" +ARG DS_BUILD_RANDOM_LTD="1" # sparse_attn has issues with PyTorch >= 2.0.0 as of DeepSpeed 0.9.4 ARG DS_BUILD_SPARSE_ATTN="0" -ARG DS_BUILD_TRANSFORMER="" -ARG DS_BUILD_TRANSFORMER_INFERENCE="" -ARG DS_BUILD_STOCHASTIC_TRANSFORMER="" -ARG DS_BUILD_UTILS="" -ARG DS_BUILD_AIO="" +ARG DS_BUILD_TRANSFORMER="1" +ARG DS_BUILD_TRANSFORMER_INFERENCE="1" +ARG DS_BUILD_STOCHASTIC_TRANSFORMER="1" +ARG DS_BUILD_UTILS="1" +ARG DS_BUILD_AIO="1" ARG DEEPSPEED_VERSION SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN python3 -m pip install -U --no-cache-dir \ - setuptools wheel pip deepspeed-kernels && \ +RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + python3 -m pip install -U --no-cache-dir \ + setuptools wheel pip py-cpuinfo && \ if python3 -m pip show torch | grep 'Version: 2\.[1-9]' > /dev/null; then \ # DeepSpeed's AIO extension is incompatible with PyTorch 2.1.x's # requirement for C++17 (as of DeepSpeed 0.10.1). 
@@ -144,9 +184,9 @@ RUN python3 -m pip install -U --no-cache-dir \ do if [[ -z ${!VAR} ]]; then unset ${VAR}; fi; done; \ } && \ CC=$(realpath -e ./compiler) \ - MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)" \ + MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 4 24)}" \ python3 -m pip wheel -w /wheels \ - --no-cache-dir --no-build-isolation --no-deps \ + --no-cache-dir --no-build-isolation --no-deps -v \ deepspeed==${DEEPSPEED_VERSION} && \ rm ./* SHELL ["/bin/sh", "-c"] @@ -164,6 +204,7 @@ RUN LIBNCCL2_VERSION=$(dpkg-query --showformat='${Version}' --show libnccl2) && # --distributed_adam, --distributed_lamb, and --group_norm aren't documented # in the Apex README, but are defined in its setup.py config. RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ python3 -m pip install -U --no-cache-dir \ packaging setuptools wheel pip && \ CUDA_MAJOR_VERSION=$(echo "${CUDA_VERSION}" | cut -d. 
-f1) && \ @@ -178,8 +219,7 @@ RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \ :; \ )" && \ export CC=$(realpath -e ./compiler) && \ - export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 24)" && \ - export NVCC_APPEND_FLAGS='-diag-suppress 186,177' && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 24)}" && \ printf -- '--config-settings="--build-option=%s" ' $( \ echo \ --cpp_ext \ @@ -218,12 +258,12 @@ FROM builder-base as xformers-builder ARG XFORMERS_VERSION SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN python3 -m pip install -U --no-cache-dir \ +RUN export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + python3 -m pip install -U --no-cache-dir \ setuptools wheel pip && \ CC=$(realpath -e ./compiler) \ MAX_JOBS=1 \ PYTHONUNBUFFERED=1 \ - NVCC_APPEND_FLAGS='-diag-suppress 186,177' \ XFORMERS_DISABLE_FLASH_ATTN=1 \ python3 -m pip wheel -w /wheels -v \ --no-cache-dir --no-build-isolation --no-deps \ diff --git a/torch-extras/compiler_wrapper.f95 b/torch-extras/compiler_wrapper.f95 index f8c13bd2..cbdc602e 100644 --- a/torch-extras/compiler_wrapper.f95 +++ b/torch-extras/compiler_wrapper.f95 @@ -1,13 +1,25 @@ +#ifndef WRAPPER_NATIVE +#define WRAPPER_NATIVE "skylake" +#endif + +#ifndef WRAPPER_CC +#define WRAPPER_CC "gcc" +#endif + +#ifndef WRAPPER_AVX +#define WRAPPER_AVX "AVX256" +#endif + PROGRAM compiler_wrapper - ! Wraps GCC invocations, - ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions - ! with -D__AVX256__, and -march=native with -march=skylake, + ! Wraps C compiler invocations, + ! replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions + ! with -D__<WRAPPER_AVX>__, and -march=native with -march=<WRAPPER_NATIVE>, ! for better reproducibility and compatibility.
IMPLICIT NONE INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0 CHARACTER(len=:), ALLOCATABLE :: arg, command ALLOCATE(CHARACTER(len=128) :: arg) - command = "gcc" + command = WRAPPER_CC DO i = 1, COMMAND_ARGUMENT_COUNT() DO @@ -22,9 +34,15 @@ PROGRAM compiler_wrapper END IF END DO IF (arg == "-march=native") THEN - command = command // " '-march=skylake'" - ELSE IF (arg == "-D__AVX512__" .OR. arg == "-D__SCALAR__") THEN - command = command // " '-D__AVX256__'" + command = command // (" '-march=" // WRAPPER_NATIVE // "'") + ELSE IF ( & + arg == "-D__AVX512__" & + .OR. arg == "-D__AVX256__" & + .OR. arg == "-D__SCALAR__" & + ) THEN +#ifndef WRAPPER_NO_AVX + command = command // (" '-D__" // WRAPPER_AVX // "__'") +#endif ELSE command = command // shell_escaped(arg) END IF diff --git a/torch/Dockerfile b/torch/Dockerfile index 6705427d..e070232a 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -1,15 +1,17 @@ -# syntax=docker/dockerfile:1.4 -ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.4.1-devel-ubuntu22.04" -ARG FINAL_BASE_IMAGE="nvidia/cuda:12.4.1-base-ubuntu22.04" +# syntax=docker/dockerfile:1.7 +ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.8.0-devel-ubuntu22.04" +ARG FINAL_BASE_IMAGE="nvidia/cuda:12.8.0-base-ubuntu22.04" -ARG BUILD_TORCH_VERSION="2.5.0" +ARG BUILD_TORCH_VERSION="2.5.1" ARG BUILD_TORCH_VISION_VERSION="0.20.0" ARG BUILD_TORCH_AUDIO_VERSION="2.5.0" -ARG BUILD_TRANSFORMERENGINE_VERSION="1.11" -ARG BUILD_FLASH_ATTN_VERSION="2.6.3" +ARG BUILD_TRANSFORMERENGINE_VERSION="1.13" +ARG BUILD_FLASH_ATTN_VERSION="2.7.4.post1" +ARG BUILD_FLASH_ATTN_3_VERSION="2.7.2.post1" ARG BUILD_TRITON_VERSION="" ARG BUILD_TRITON="1" -ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX" +ARG BUILD_TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0 10.0+PTX" +ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST="70;80;89;90;100" # 8.7 is supported in the PyTorch main branch, but not 2.0.0 @@ -19,42 +21,47 @@ ARG 
AOCL_URL="https://download.amd.com/developer/eula/aocl/aocl-4-2/aocl-linux-a # Clone PyTorch repositories independently from all other build steps # for cache-friendliness and parallelization -FROM alpine/git:2.40.1 as downloader-base +FROM alpine/git:2.40.1 AS downloader-base WORKDIR /git RUN git config --global advice.detachedHead false COPY <<-"EOT" /git/clone.sh - #!/bin/sh - REPO="https://github.com/$1"; - DEST="$2"; - REF="$3"; - - CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; }; - - # Try cloning REF as a tag prefixed with "v", otherwise fall back - # to git checkout for commit hashes - CLONE --recurse-submodules --shallow-submodules --also-filter-submodules --no-tags \ - "$REPO" -b "v$REF" "$DEST" || { \ - CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \ - git -C "$DEST" checkout "$REF" && \ - git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \ - }; + #!/bin/sh + REPO="https://github.com/$1"; + DEST="$2"; + REF="$3"; + + CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; }; + + # Try cloning REF as a tag prefixed with "v", otherwise fall back + # to git checkout for commit hashes + CLONE --recurse-submodules --shallow-submodules --also-filter-submodules --no-tags \ + "$REPO" -b "v$REF" "$DEST" || { \ + CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \ + git -C "$DEST" checkout "$REF" && \ + git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \ + }; EOT RUN chmod 755 /git/clone.sh -FROM downloader-base as pytorch-downloader +FROM downloader-base AS pytorch-downloader ARG BUILD_TORCH_VERSION +# Includes a patch for a foreach bug in PyTorch v2.5.1 RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \ + if [ "${BUILD_TORCH_VERSION}" = '2.5.1' ]; then \ + wget 'https://github.com/pytorch/pytorch/commit/1cdaf1d85f5e4b3f8952fd0737a1afeb16995d13.patch' -qO- \ + | git -C pytorch apply; \ + fi && \ rm -rf pytorch/.git -FROM 
downloader-base as torchvision-downloader +FROM downloader-base AS torchvision-downloader ARG BUILD_TORCH_VISION_VERSION RUN ./clone.sh pytorch/vision vision "${BUILD_TORCH_VISION_VERSION}" && \ rm -rf vision/.git -FROM downloader-base as torchaudio-downloader +FROM downloader-base AS torchaudio-downloader ARG BUILD_TORCH_AUDIO_VERSION RUN ./clone.sh pytorch/audio audio "${BUILD_TORCH_AUDIO_VERSION}" # The torchaudio build requires that this directory remain a full git repository, @@ -70,22 +77,23 @@ RUN if grep -qF '#include ' \ fi && \ rm /git/patch -FROM downloader-base as transformerengine-downloader +FROM downloader-base AS transformerengine-downloader ARG BUILD_TRANSFORMERENGINE_VERSION RUN ./clone.sh NVIDIA/TransformerEngine TransformerEngine "${BUILD_TRANSFORMERENGINE_VERSION}" -# Include a patch commit that is sort-of part of v1.11 but isn't in their v1.11 release git tag -# See https://github.com/NVIDIA/TransformerEngine/pull/1222 -RUN if [ "${BUILD_TRANSFORMERENGINE_VERSION}" = '1.11' ]; then \ - wget 'https://github.com/NVIDIA/TransformerEngine/commit/fc034785f5e3a5bc5600a88766d9a1d75137ce77.patch' -qO- \ - | git -C TransformerEngine apply -v --stat --apply -; \ - fi - -FROM downloader-base as flash-attn-downloader +FROM downloader-base AS flash-attn-downloader ARG BUILD_FLASH_ATTN_VERSION RUN ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_VERSION}" -FROM downloader-base as triton-version +FROM downloader-base AS flash-attn-3-downloader +ARG BUILD_FLASH_ATTN_3_VERSION +RUN if [ -n "$BUILD_FLASH_ATTN_3_VERSION" ]; then \ + ./clone.sh Dao-AILab/flash-attention flash-attention "${BUILD_FLASH_ATTN_3_VERSION}"; \ + else \ + mkdir flash-attention; \ + fi + +FROM downloader-base AS triton-version ENV TRITON_COMMIT_FILE='.ci/docker/ci_commit_pins/triton.txt' COPY --link --from=pytorch-downloader "/git/pytorch/${TRITON_COMMIT_FILE}" /git/version.txt ARG BUILD_TRITON_VERSION @@ -93,7 +101,7 @@ RUN if [ -n "${BUILD_TRITON_VERSION}" ]; 
then \ echo "${BUILD_TRITON_VERSION}" > /git/version.txt; \ fi -FROM downloader-base as triton-downloader +FROM downloader-base AS triton-downloader COPY --link --from=triton-version /git/version.txt /git/version.txt ARG BUILD_TRITON RUN if [ "${BUILD_TRITON}" = '1' ]; then \ @@ -102,7 +110,7 @@ RUN if [ "${BUILD_TRITON}" = '1' ]; then \ mkdir triton; \ fi -FROM alpine/curl:8.7.1 as aocl-downloader +FROM alpine/curl:8.7.1 AS aocl-downloader WORKDIR /tmp/install RUN apk add --no-cache bash @@ -128,7 +136,7 @@ RUN curl -sSfo- "${AOCL_URL}" | tar xzf - --strip-components 1 && \ ## Build PyTorch on a builder image. -FROM ${BUILDER_BASE_IMAGE} as builder-base +FROM ${BUILDER_BASE_IMAGE} AS builder-base-shared ENV DEBIAN_FRONTEND=noninteractive ARG BUILD_CCACHE_SIZE="1Gi" @@ -142,7 +150,7 @@ RUN apt-get -qq update && apt-get -qq install -y \ /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ - ln -s libomp.so.5 /usr/lib/x86_64-linux-gnu/libomp.so && \ + ln -s libomp.so.5 "/usr/lib/$(gcc -print-multiarch)/libomp.so" && \ ldconfig COPY --link --chmod=755 install_cudnn.sh /tmp/install_cudnn.sh @@ -160,12 +168,24 @@ RUN export \ rm /tmp/install_cudnn.sh && \ apt-get clean +# Add Kitware's apt repository to get a newer version of CMake +RUN apt-get -qq update && apt-get -qq install -y \ + software-properties-common lsb-release && \ + { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \ + | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \ + apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ + apt-get -qq update && apt-get -qq install -y cmake && apt-get clean + RUN mkdir /tmp/ccache-install && \ cd /tmp/ccache-install && \ - CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2-linux-x86_64.tar.xz' && \ - wget -qO - 
$CCACHE_URL | tar --strip-components 1 -xJf - && \ + CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2.tar.xz' && \ + wget -qO - "$CCACHE_URL" | tar --strip-components 1 -xJf - && \ + mkdir build && \ + cd build && \ + cmake -B. -S.. -DCMAKE_BUILD_TYPE=Release && \ + cmake --build . --config Release && \ make install && \ - cd .. && \ + cd ../.. && \ rm -rf /tmp/ccache-install && \ ccache -M "${BUILD_CCACHE_SIZE}" && \ ccache -F 0 @@ -176,30 +196,35 @@ ENV CCACHE_DIR=/ccache \ CMAKE_CXX_COMPILER_LAUNCHER=ccache \ CMAKE_CUDA_COMPILER_LAUNCHER=ccache -# Add Kitware's apt repository to get a newer version of CMake -RUN apt-get -qq update && apt-get -qq install -y \ - software-properties-common lsb-release && \ - { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \ - | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \ - apt-add-repository -n "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ - apt-get -qq update && apt-get -qq install -y cmake && apt-get clean - # Update compiler (GCC) and linker (LLD) versions -RUN CODENAME="$(lsb_release -cs)" && \ +RUN LLVM_VERSION='18' && \ + CODENAME="$(lsb_release -cs)" && \ wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ - apt-add-repository -n "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \ + apt-add-repository -n "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \ SETUP_TOOLCHAIN() { \ apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \ | sed -e '/connection timed out/{p; Q1}' && \ - apt-get -qq install --no-install-recommends -y gcc-11 g++-11 gfortran-11 lld-17 && \ + apt-get -qq install --no-install-recommends -y \ + gcc-11 g++-11 gfortran-11 \ + "lld-$LLVM_VERSION" "libomp-$LLVM_VERSION-dev" && \ apt-get clean; \ } && \ { SETUP_TOOLCHAIN || { sleep "$(shuf -i10-20 -n1)" && SETUP_TOOLCHAIN; }; } && \ update-alternatives 
--install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \ update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \ update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \ - update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1 + if [ "$(uname -m)" != 'aarch64' ]; then \ + update-alternatives --install /usr/bin/ld ld "/usr/bin/ld.lld-$LLVM_VERSION" 1; \ + fi && \ + ldconfig + + +FROM builder-base-shared AS builder-base-arm64 +# There is currently no CPU BLAS used for ARM builds, +# so this stage is just an alias + +FROM builder-base-shared AS builder-base-amd64 # Install AOCL-BLAS and AOCL-LAPACK # See: https://www.amd.com/en/developer/aocl/dense.html ARG AOCL_BASE @@ -231,39 +256,50 @@ ENV C_INCLUDE_PATH="${AOCL_BASE}/include${C_INCLUDE_PATH:+:$C_INCLUDE_PATH}" \ LD_LIBRARY_PATH="${AOCL_BASE}/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \ LIBRARY_PATH="${AOCL_BASE}/lib${LIBRARY_PATH:+:$LIBRARY_PATH}" + +FROM builder-base-${TARGETARCH} AS builder-base RUN mkdir /build /build/dist WORKDIR /build COPY --chmod=755 effective_cpu_count.sh . COPY --chmod=755 scale.sh . COPY compiler_wrapper.f95 . 
-RUN gfortran -O3 ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 +ARG AMD64_NATIVE_ARCH="skylake" +ARG ARM64_NATIVE_ARCH="armv8.5-a+nopredres" +RUN if [ "$(uname -m)" = "aarch64" ]; then \ + NATIVE="WRAPPER_NATIVE=\"${ARM64_NATIVE_ARCH}\"" && \ + AVX='WRAPPER_NO_AVX'; \ + else \ + NATIVE="WRAPPER_NATIVE=\"${AMD64_NATIVE_ARCH}\"" && \ + AVX='WRAPPER_AVX="AVX256"'; \ + fi && \ + gfortran -ffree-line-length-512 -cpp -O3 "-D${NATIVE}" "-D${AVX}" ./compiler_wrapper.f95 -o ./compiler && rm ./compiler_wrapper.f95 COPY <<-"EOT" /build/version-string.sh - #!/bin/sh - set -e; - VERSION="$1"; - - IS_HASH() { - echo "$1" | grep -qxiEe '[0-9a-f]{40}'; - }; - - if IS_HASH "$VERSION"; then - REAL_VERSION="$(cat ./version.txt)"; - SHORT_HASH="$(echo "$VERSION" | cut -c1-7)"; - echo "$REAL_VERSION+$SHORT_HASH"; - else - echo "$VERSION"; - fi; + #!/bin/sh + set -e; + VERSION="$1"; + + IS_HASH() { + echo "$1" | grep -qxiEe '[0-9a-f]{40}'; + }; + + if IS_HASH "$VERSION"; then + REAL_VERSION="$(cat ./version.txt)"; + SHORT_HASH="$(echo "$VERSION" | cut -c1-7)"; + echo "$REAL_VERSION+$SHORT_HASH"; + else + echo "$VERSION"; + fi; EOT RUN chmod 755 /build/version-string.sh COPY <<-"EOT" /build/storage-info.sh - #!/bin/sh - set -e; - TARGET="$(realpath "$1")"; + #!/bin/sh + set -e; + TARGET="$(realpath "$1")"; - STORAGE_INFO="$(df -h '--output=fstype,used,avail,pcent,target' "$TARGET")" || exit 0; - printf 'Storage info for %s:\n%s\n' "$TARGET" "$STORAGE_INFO"; + STORAGE_INFO="$(df -h '--output=fstype,used,avail,pcent,target' "$TARGET")" || exit 0; + printf 'Storage info for %s:\n%s\n' "$TARGET" "$STORAGE_INFO"; EOT RUN chmod 755 /build/storage-info.sh @@ -280,10 +316,12 @@ ENV CMAKE_PREFIX_PATH=/usr/bin/ \ CUDNN_LIB_DIR=/usr/local/cuda/lib64 ARG BUILD_TRITON +ARG BUILD_MAX_JOBS="" RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \ --mount=type=cache,target=/ccache \ if [ "$BUILD_TRITON" = '1' ]; then \ - export MAX_JOBS="$(./scale.sh 
"$(./effective_cpu_count.sh)" 3 32)" && \ + pip3 install --no-cache-dir pybind11 && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ cd triton/python && \ python3 -m pip wheel -w ../../dist/ --no-build-isolation --no-deps -vv . && \ pip3 install ../../dist/*.whl; \ @@ -292,7 +330,19 @@ RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,r ARG BUILD_TORCH_VERSION ARG BUILD_TORCH_CUDA_ARCH_LIST ENV TORCH_VERSION=$BUILD_TORCH_VERSION -ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST +# Filter out the 10.0 arch on CUDA versions != 12.8 +ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${BUILD_TORCH_CUDA_ARCH_LIST/ 10.0/}||${BUILD_TORCH_CUDA_ARCH_LIST}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}" + +ARG BUILD_NVCC_APPEND_FLAGS="-gencode=arch=compute_90a,code=sm_90a" +# Add sm_100a build if NV_CUDA_LIB_VERSION matches 12.[89].* +RUN FLAGS="$BUILD_NVCC_APPEND_FLAGS" && \ + case "${NV_CUDA_LIB_VERSION}" in 12.[89].*) \ + FLAGS="${FLAGS} -gencode=arch=compute_100a,code=sm_100a" ;; \ + esac && \ + echo "-Wno-deprecated-gpu-targets -diag-suppress 191,186,177${FLAGS:+ $FLAGS}" > /build/nvcc.conf # If the directory /opt/nccl-tests exists, # the base image is assumed to be nccl-tests, @@ -320,16 +370,26 @@ ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST # (See: https://github.com/pytorch/pytorch/blob/v2.3.0/cmake/Modules/FindBLAS.cmake#L259-L271). # Without WITH_BLAS, it would detect the BLAS implementation as # BLAS_INFO=blis instead of BLAS_INFO=FLAME and wouldn't include LAPACK either. 
+ARG BUILD_CXX11_ABI="" +SHELL ["/bin/bash", "-eo", "pipefail", "-c"] RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \ --mount=type=cache,target=/ccache \ - export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ echo "MAX_JOBS: ${MAX_JOBS}" && \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ + if [ -n "${BUILD_CXX11_ABI}" ]; then \ + export _GLIBCXX_USE_CXX11_ABI="${BUILD_CXX11_ABI}"; \ + fi && \ ./storage-info.sh . && \ cd pytorch && \ ../storage-info.sh . && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ ln -s /usr/bin/c++ build/c++ && \ + if [ "$(uname -m)" = 'aarch64' ]; then \ + export USE_PRIORITIZED_TEXT_FOR_LD=1; \ + fi && \ { if [ -d /opt/nccl-tests ]; then \ export \ USE_DISTRIBUTED=1 \ @@ -350,16 +410,16 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch WITH_BLAS=FLAME \ PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \ PYTORCH_BUILD_NUMBER=0 \ - TORCH_NVCC_FLAGS="-Xfatbin -compress-all -diag-suppress 191,186,177" \ - python3 setup.py bdist_wheel --dist-dir ../dist + TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ + python3 setup.py bdist_wheel --dist-dir ../dist 2>&1 \ + | grep -Ev --line-buffered '^(ptxas /tmp/|copying .+/|creating build/)' +SHELL ["/bin/sh", "-c"] RUN pip3 install --no-cache-dir --upgrade dist/torch*.whl -ENV NVCC_APPEND_FLAGS="-diag-suppress 191,186,177" - RUN python3 -m pip install -U --no-cache-dir \ packaging setuptools wheel pip -FROM builder-base as torchvision-builder +FROM builder-base AS torchvision-builder RUN rm ./dist/* ## Build torchvision @@ -370,7 +430,10 @@ RUN pip3 install --no-cache-dir --upgrade \ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=vision/,rw \ --mount=type=cache,target=/ccache \ - export MAX_JOBS="$(./scale.sh 
"$(./effective_cpu_count.sh)" 3 32)" && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ + echo "MAX_JOBS: ${MAX_JOBS}" && \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ cd vision && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ @@ -396,7 +459,7 @@ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=visi TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist -FROM builder-base as torchaudio-builder +FROM builder-base AS torchaudio-builder RUN rm ./dist/* ## Build torchaudio @@ -407,7 +470,10 @@ RUN pip3 install --no-cache-dir --upgrade \ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/,rw \ --mount=type=cache,target=/ccache \ - export MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)" && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 3 32)}" && \ + echo "MAX_JOBS: ${MAX_JOBS}" && \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ cd audio && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ @@ -433,12 +499,23 @@ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/ TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist -FROM builder-base as transformerengine-builder +FROM builder-base AS transformerengine-builder RUN rm ./dist/* # Build TransformerEngine +ARG BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST +ENV NVTE_CUDA_ARCHS=$BUILD_TRANSFORMERENGINE_CUDA_ARCH_LIST + RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerEngine,target=TransformerEngine/,rw \ + --mount=type=cache,target=/ccache \ export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \ + export MAX_JOBS="${BUILD_MAX_JOBS:-$MAX_JOBS}" && \ + echo "MAX_JOBS: ${MAX_JOBS}" && \ + export NVCC_APPEND_FLAGS="$(cat 
/build/nvcc.conf)" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ + case "${CUDA_VERSION}" in 12.[0123456].*) \ + export NVTE_CUDA_ARCHS="${NVTE_CUDA_ARCHS%;100*}" ;; \ + esac && \ cd TransformerEngine && \ if python3 -c "import sys; sys.exit(sys.version_info.minor > 8)"; then \ sed -i "s/from functools import cache/from functools import lru_cache as cache/g" \ @@ -446,40 +523,67 @@ RUN --mount=type=bind,from=transformerengine-downloader,source=/git/TransformerE fi && \ python3 setup.py bdist_wheel --dist-dir /build/dist -FROM builder-base as flash-attn-builder +FROM builder-base AS flash-attn-builder-base RUN rm ./dist/* +ENV PYTHONUNBUFFERED=1 +ENV FLASH_ATTENTION_FORCE_BUILD=TRUE +ARG BUILD_FLASH_ATTN_MAX_JOBS="" + +COPY <<-"EOT" /build/fa-build.sh + #!/bin/bash + set -eo pipefail; + if [ -n "$1" ]; then cd "$1"; fi; + python3 setup.py bdist_wheel --dist-dir /build/dist \ + | grep -Ev --line-buffered '^ptxas (/tmp/|(info|warning)\s*:)|bytes spill stores' +EOT +RUN chmod 755 /build/fa-build.sh + +FROM flash-attn-builder-base AS flash-attn-builder -SHELL ["/bin/bash", "-o", "pipefail", "-c"] +# Build flash-attn RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \ + --mount=type=cache,target=/ccache \ export CC=$(realpath -e ./compiler) \ - MAX_JOBS="$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)" \ - PYTHONUNBUFFERED=1 \ - FLASH_ATTENTION_FORCE_BUILD='TRUE' && \ + MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 8 12)}" && \ + echo "MAX_JOBS: ${MAX_JOBS}" && \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ cd flash-attention && \ - ( \ - for EXT_DIR in $(realpath -s -e \ - . 
\ - csrc/ft_attention \ - csrc/fused_dense_lib \ - csrc/fused_softmax \ - csrc/layer_norm \ - csrc/rotary \ - csrc/xentropy); \ - do \ - cd $EXT_DIR && \ - python3 setup.py bdist_wheel --dist-dir /build/dist && \ - cd - || \ - exit 1; \ - done; \ - ) | \ - grep -Ev --line-buffered 'ptxas info\s*:|bytes spill stores' -SHELL ["/bin/sh", "-c"] + for EXT_DIR in $(realpath -s -e \ + . \ + csrc/ft_attention \ + csrc/fused_dense_lib \ + csrc/fused_softmax \ + csrc/layer_norm \ + csrc/rotary \ + csrc/xentropy); \ + do /build/fa-build.sh "$EXT_DIR" || exit 1; done + +FROM flash-attn-builder-base AS flash-attn-3-builder + +# Artificially sequence this build stage after the previous one +# to prevent parallelism, because these are both very resource-intensive +RUN --mount=type=bind,from=flash-attn-builder,source=/build,target=/build : + +# Build flash-attn v3 +RUN --mount=type=bind,from=flash-attn-3-downloader,source=/git/flash-attention,target=flash-attention/,rw \ + --mount=type=cache,target=/ccache \ + if [ ! -d flash-attention/hopper ]; then \ + echo "Not compiling flash-attn v3" && exit 0; \ + fi && \ + export CC=$(realpath -e ./compiler) \ + MAX_JOBS="${BUILD_FLASH_ATTN_MAX_JOBS:-$(./scale.sh "$(./effective_cpu_count.sh)" 10 6)}" && \ + echo "MAX_JOBS: ${MAX_JOBS}" && \ + export NVCC_APPEND_FLAGS="$(cat /build/nvcc.conf)" && \ + echo "NVCC_APPEND_FLAGS: ${NVCC_APPEND_FLAGS}" && \ + /build/fa-build.sh flash-attention/hopper -FROM builder-base as builder +FROM builder-base AS builder COPY --link --from=torchaudio-builder /build/dist/ /build/dist/ COPY --link --from=torchvision-builder /build/dist/ /build/dist/ COPY --link --from=transformerengine-builder /build/dist/ /build/dist/ COPY --link --from=flash-attn-builder /build/dist/ /build/dist/ +COPY --link --from=flash-attn-3-builder /build/dist/ /build/dist/ ## Build the final torch image.
FROM ${FINAL_BASE_IMAGE} @@ -496,11 +600,11 @@ RUN apt-get -qq update && apt-get -qq install -y \ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ update-alternatives --install /usr/bin/vim vim /usr/bin/vim.tiny 1 && \ - ln -s libomp.so.5 /usr/lib/x86_64-linux-gnu/libomp.so && \ + ln -s libomp.so.5 "/usr/lib/$(gcc -print-multiarch)/libomp.so" && \ ldconfig RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \ - software-properties-common && \ + software-properties-common lsb-release && \ SETUP_LIBSTDCXX() { \ apt-add-repository -y ppa:ubuntu-toolchain-r/test 2>&1 \ | sed -e '/connection timed out/{p; Q1}' && \ @@ -509,6 +613,13 @@ RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \ } && \ { SETUP_LIBSTDCXX || { sleep "$(shuf -i10-20 -n1)" && SETUP_LIBSTDCXX; }; } +RUN LLVM_VERSION='18' && \ + CODENAME="$(lsb_release -cs)" && \ + wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ + apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-$LLVM_VERSION main" && \ + apt-get -qq install -y --no-install-recommends "libomp5-$LLVM_VERSION" && \ + apt-get clean + # Install AOCL-BLAS and AOCL-LAPACK # See: https://www.amd.com/en/developer/aocl/dense.html ARG AOCL_BASE @@ -525,7 +636,11 @@ ARG BUILD_TORCH_CUDA_ARCH_LIST ENV TORCH_VERSION=$BUILD_TORCH_VERSION ENV TORCH_VISION_VERSION=$BUILD_TORCH_VISION_VERSION ENV TORCH_AUDIO_VERSION=$BUILD_TORCH_AUDIO_VERSION -ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST +# Filter out the 10.0 arch on CUDA versions != 12.8 +ENV TORCH_CUDA_ARCH_LIST="${CUDA_VERSION##12.8.*}||${BUILD_TORCH_CUDA_ARCH_LIST/ 10.0/}||${BUILD_TORCH_CUDA_ARCH_LIST}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#||*||}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST%||*}" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST#*||}" COPY --link 
--chmod=755 install_cudnn.sh /tmp/install_cudnn.sh # - libnvjitlink-X-Y only exists for CUDA versions >= 12-0. @@ -564,3 +679,51 @@ WORKDIR /usr/src/app RUN --mount=type=bind,from=builder,source=/build/dist,target=. \ pip3 install --no-cache-dir -U numpy packaging && \ pip3 install --no-cache-dir -U ./*.whl + +# Make a symlink to flash-attn v3 where TransformerEngine expects it, +# and modify the installation record so that pip uninstall knows how to +# fully remove it. +RUN <<-"EOT" python3 + #!/bin/env python3 + from base64 import urlsafe_b64encode as b64 + from hashlib import sha256 + from importlib import metadata + from pathlib import Path + from py_compile import compile + + dist = metadata.distribution("flashattn-hopper") + p = dist.locate_file("flash_attn_interface.py") + print("flash_attn_interface:", p) + root = p.parent + + if not p.exists(): + raise SystemExit("flash_attn_interface not found") + if not p.is_file(): + raise SystemExit("flash_attn_interface path is not a file") + + d = root / "flashattn_hopper" + if d.exists(): + raise SystemExit(f'"{d}" already exists') + + d.mkdir(mode=0o755, parents=False, exist_ok=False) + new = d / p.name + new.symlink_to(p) + print(f"Created new symlink at {new}") + + compiled = Path(compile(new)) + + + def record_entry(path: Path) -> str: + content = path.read_bytes() + digest = b64(sha256(content).digest()).rstrip(b"=").decode() + package_path = path.relative_to(root).as_posix() + return f"{package_path},sha256={digest},{len(content):d}\r\n" + + + for f in dist.files: + if f.match("flashattn?hopper-*.dist-info/RECORD"): + with f.locate().open("a", encoding="utf-8", newline="") as record: + for added in (new, compiled): + record.write(record_entry(added)) + break +EOT diff --git a/torch/compiler_wrapper.f95 b/torch/compiler_wrapper.f95 index f8c13bd2..cbdc602e 100644 --- a/torch/compiler_wrapper.f95 +++ b/torch/compiler_wrapper.f95 @@ -1,13 +1,25 @@ +#ifndef WRAPPER_NATIVE +#define WRAPPER_NATIVE "skylake" +#endif 
+ +#ifndef WRAPPER_CC +#define WRAPPER_CC "gcc" +#endif + +#ifndef WRAPPER_AVX +#define WRAPPER_AVX "AVX256" +#endif + PROGRAM compiler_wrapper - ! Wraps GCC invocations, - ! replacing -D__AVX512__ and -D__SCALAR__ preprocessor definitions - ! with -D__AVX256__, and -march=native with -march=skylake, + ! Wraps C compiler invocations, + ! replacing -D__AVX512__, -D__AVX256__, and -D__SCALAR__ preprocessor definitions + ! with -D__<WRAPPER_AVX>__, and -march=native with -march=<WRAPPER_NATIVE>, ! for better reproducibility and compatibility. IMPLICIT NONE INTEGER :: i, exitcode = 0, full_length = 0, truncated = 0 CHARACTER(len=:), ALLOCATABLE :: arg, command ALLOCATE(CHARACTER(len=128) :: arg) - command = "gcc" + command = WRAPPER_CC DO i = 1, COMMAND_ARGUMENT_COUNT() DO @@ -22,9 +34,15 @@ PROGRAM compiler_wrapper END IF END DO IF (arg == "-march=native") THEN - command = command // " '-march=skylake'" - ELSE IF (arg == "-D__AVX512__" .OR. arg == "-D__SCALAR__") THEN - command = command // " '-D__AVX256__'" + command = command // (" '-march=" // WRAPPER_NATIVE // "'") + ELSE IF ( & + arg == "-D__AVX512__" & + .OR. arg == "-D__AVX256__" & + .OR. arg == "-D__SCALAR__" & + ) THEN +#ifndef WRAPPER_NO_AVX + command = command // (" '-D__" // WRAPPER_AVX // "__'") +#endif ELSE command = command // shell_escaped(arg) END IF