diff --git a/.github/configurations/vllm-tensorizer.yml b/.github/configurations/vllm-tensorizer.yml
new file mode 100644
index 00000000..39f2ad1e
--- /dev/null
+++ b/.github/configurations/vllm-tensorizer.yml
@@ -0,0 +1,6 @@
+vllm-commit:
+  - 'b6553be1bc75f046b00046a4ad7576364d03c835'
+flashinfer-commit:
+  - 'v0.2.6.post1'
+base-image:
+  - 'ghcr.io/coreweave/ml-containers/torch-extras:es-compute-12.0-67208ca-nccl-cuda12.9.0-ubuntu22.04-nccl2.27.3-1-torch2.7.1-vision0.22.1-audio2.7.1-abi1'
diff --git a/.github/workflows/vllm-tensorizer.yml b/.github/workflows/vllm-tensorizer.yml
index 14d2c7db..b73a3fe6 100644
--- a/.github/workflows/vllm-tensorizer.yml
+++ b/.github/workflows/vllm-tensorizer.yml
@@ -1,9 +1,4 @@
 on:
-  workflow_dispatch:
-    inputs:
-      commit:
-        description: 'Commit to build'
-        required: true
   push:
     paths:
       - "vllm-tensorizer/**"
@@ -12,12 +7,22 @@ on:
 
 jobs:
+  get-config:
+    name: Get vllm-tensorizer config
+    uses: ./.github/workflows/read-configuration.yml
+    with:
+      path: ./.github/configurations/vllm-tensorizer.yml
 
   build:
     uses: ./.github/workflows/build.yml
+    needs: get-config
+    strategy:
+      matrix: ${{ fromJSON(needs.get-config.outputs.config) }}
     secrets: inherit
     with:
       image-name: vllm-tensorizer
       folder: vllm-tensorizer
-      tag-suffix: ${{ inputs.commit || '19307ba71ddeb7e1cc6aec3c1baa8b50d59c1beb'}}
+      tag-suffix: ${{ matrix.vllm-commit }}
       build-args: |
-        COMMIT_HASH=${{ inputs.commit || '19307ba71ddeb7e1cc6aec3c1baa8b50d59c1beb'}}
\ No newline at end of file
+        VLLM_COMMIT=${{ matrix.vllm-commit }}
+        FLASHINFER_COMMIT=${{ matrix.flashinfer-commit }}
+        BASE_IMAGE=${{ matrix.base-image }}
diff --git a/.gitignore b/.gitignore
index 9d90afc5..fee2d8d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,6 @@ flycheck_*.el
 .env*
 .environment
 .environment*
+
+# JetBrains Idea files
+.idea/
diff --git a/vllm-tensorizer/Dockerfile b/vllm-tensorizer/Dockerfile
index 9c4df618..8ce6ad15 100644
--- a/vllm-tensorizer/Dockerfile
+++ b/vllm-tensorizer/Dockerfile
@@ -1,58 +1,64 @@
-ARG BASE_IMAGE="ghcr.io/coreweave/ml-containers/torch-extras:es-22.04-58a49a2-base-cuda12.1.1-torch2.1.2-vision0.16.2-audio2.1.2-flash_attn2.4.2"
-
-FROM scratch as freezer
+ARG BASE_IMAGE="ghcr.io/coreweave/ml-containers/torch-extras:es-compute-12.0-67208ca-nccl-cuda12.9.0-ubuntu22.04-nccl2.27.3-1-torch2.7.1-vision0.22.1-audio2.7.1-abi1"
+FROM scratch AS freezer
 WORKDIR /
 COPY --chmod=755 freeze.sh /
 
-FROM ${BASE_IMAGE} as builder-base
-
-ARG MAX_JOBS=""
-
-# Dependencies requiring NVCC are built ahead of time in a separate stage
-# so that the ~2 GiB dev library installations don't have to be included
-# in the final image.
-RUN export \
-      CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f1) \
-      CUDA_MINOR_VERSION=$(echo $CUDA_VERSION | cut -d. -f2) && \
-    export \
-      CUDA_PACKAGE_VERSION="${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" && \
-    apt-get -qq update && apt-get install -y --no-install-recommends \
-      cuda-nvcc-${CUDA_PACKAGE_VERSION} \
-      cuda-nvml-dev-${CUDA_PACKAGE_VERSION} \
-      libcurand-dev-${CUDA_PACKAGE_VERSION} \
-      libcublas-dev-${CUDA_PACKAGE_VERSION} \
-      libcusparse-dev-${CUDA_PACKAGE_VERSION} \
-      libcusolver-dev-${CUDA_PACKAGE_VERSION} \
-      cuda-nvprof-${CUDA_PACKAGE_VERSION} \
-      cuda-profiler-api-${CUDA_PACKAGE_VERSION} \
-      libaio-dev \
-      ninja-build && \
-    apt-get clean
+FROM ${BASE_IMAGE} AS builder-base
+
+ARG MAX_JOBS="16"
 
 RUN ldconfig
 
 RUN apt-get -qq update && \
     apt-get -qq install -y --no-install-recommends \
-      python3-pip git ninja-build && \
+      python3-pip git ninja-build cmake && \
     apt-get clean && \
-    pip3 install -U --no-cache-dir pip packaging setuptools wheel
+    pip3 install -U --no-cache-dir pip packaging setuptools wheel setuptools_scm regex
+
+# Create the /wheels directory
+WORKDIR /wheels
+
+WORKDIR /workspace
+
 
-FROM alpine/git:2.36.3 as vllm-downloader
+FROM alpine/git:2.36.3 AS vllm-downloader
 WORKDIR /git
-ARG COMMIT_HASH
-RUN git clone --filter=blob:none --depth 1 --no-single-branch --no-checkout \
-      https://github.com/coreweave/vllm.git && \
+ARG VLLM_COMMIT
+RUN git clone --filter=tree:0 --no-single-branch --no-checkout \
+      https://github.com/vllm-project/vllm && \
     cd vllm && \
-    git checkout "${COMMIT_HASH}" && \
+    git checkout "${VLLM_COMMIT}" && \
     git submodule update --init --recursive --jobs 8 \
-      --depth 1 --filter=blob:none
+      --depth 1 --filter=tree:0
 
-FROM builder-base as vllm-builder
-WORKDIR /workspace
+
+FROM alpine/git:2.36.3 AS flashinfer-downloader
+WORKDIR /git
+ARG FLASHINFER_COMMIT
+RUN git clone --filter=tree:0 --no-single-branch --no-checkout \
+      https://github.com/flashinfer-ai/flashinfer && \
+    cd flashinfer && \
+    git checkout "${FLASHINFER_COMMIT}" && \
+    git submodule update --init --recursive --jobs 8 \
+      --depth 1 --filter=tree:0
+
+
+FROM builder-base AS vllm-builder
 RUN --mount=type=bind,from=vllm-downloader,source=/git/vllm,target=/workspace,rw \
     --mount=type=bind,from=freezer,target=/tmp/frozen,rw \
     /tmp/frozen/freeze.sh torch torchaudio torchvision xformers > /tmp/frozen/constraints.txt && \
-    LIBRARY_PATH="/usr/local/cuda/lib64/stubs${LIBRARY_PATH:+:$LIBRARY_PATH}" \
+    if [ -z "$MAX_JOBS" ]; then unset MAX_JOBS; fi && \
+    python3 -m pip install --no-cache-dir py-cpuinfo && \
+    if [ -f 'use_existing_torch.py' ]; then \
+      python3 use_existing_torch.py; \
+    else \
+      git cat-file blob \
+        e489ad7a210f4234db696d1f2749d5f3662fa65b:use_existing_torch.py \
+        | python3 -; \
+    fi && \
+    USE_CUDNN=1 USE_CUSPARSELT=1 \
+    LIBRARY_PATH="/usr/local/cuda/lib64:${LIBRARY_PATH:+:$LIBRARY_PATH}" \
+    CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \
     python3 -m pip wheel -w /wheels \
     -v --no-cache-dir --no-build-isolation --no-deps \
     -c /tmp/frozen/constraints.txt \
@@ -60,22 +66,48 @@ RUN --mount=type=bind,from=vllm-downloader,source=/git/vllm,target=/workspace,rw
 WORKDIR /wheels
 
 
-FROM ${BASE_IMAGE} as base
+
+FROM builder-base AS flashinfer-builder
+RUN --mount=type=bind,from=flashinfer-downloader,source=/git/flashinfer,target=/workspace,rw \
+    --mount=type=bind,from=freezer,target=/tmp/frozen,rw \
+    /tmp/frozen/freeze.sh torch torchaudio torchvision xformers > /tmp/frozen/constraints.txt && \
+    export TORCH_CUDA_ARCH_LIST="$(echo "${TORCH_CUDA_ARCH_LIST}" | sed 's@[67]\.0 \+@@g')" && \
+    python3 -m flashinfer.aot && \
+    python3 -m pip wheel -w /wheels \
+    -v --no-cache-dir --no-build-isolation --no-deps \
+    -c /tmp/frozen/constraints.txt \
+    ./
+
+WORKDIR /wheels
+
+
+FROM ${BASE_IMAGE} AS base
 WORKDIR /workspace
 
 
-RUN apt-get -qq update && apt-get install -y --no-install-recommends curl && apt-get clean
+RUN apt-get -qq update && apt-get install -y --no-install-recommends curl libsodium23 && apt-get clean
 
 RUN --mount=type=bind,from=freezer,target=/tmp/frozen \
     /tmp/frozen/freeze.sh torch torchaudio torchvision xformers > /tmp/constraints.txt
 
 
-RUN python3 -m pip install --no-cache-dir \
-    "fschat[model_worker] == 0.2.30" "triton == 2.1.0" \
-    -c /tmp/constraints.txt
-
 RUN --mount=type=bind,from=vllm-builder,source=/wheels,target=/tmp/wheels \
-    python3 -m pip install --no-cache-dir /tmp/wheels/*.whl -c /tmp/constraints.txt && \
+    python3 -m pip install --no-cache-dir "$(printf '%s[tensorizer]' /tmp/wheels/*.whl)" -c /tmp/constraints.txt
+
+RUN --mount=type=bind,from=flashinfer-builder,source=/wheels,target=/tmp/wheels \
+    python3 -m pip install --no-cache-dir /tmp/wheels/*.whl -c /tmp/constraints.txt
+
+# Copied from vLLM's Dockerfile
+ARG TARGETPLATFORM
+
+RUN if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+      python3 -m pip install --no-cache-dir \
+        accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' \
+        boto3 runai-model-streamer runai-model-streamer[s3] -c /tmp/constraints.txt; \
+    else \
+      python3 -m pip install --no-cache-dir \
+        accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' \
+        boto3 runai-model-streamer runai-model-streamer[s3] -c /tmp/constraints.txt; \
+    fi && \
    rm /tmp/constraints.txt
-
-EXPOSE 8080
\ No newline at end of file
+EXPOSE 8080
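
For reference, a single entry of the build matrix defined in .github/configurations/vllm-tensorizer.yml expands to a plain docker build roughly like the sketch below. This is for local reproduction only, assuming the repository root as the working directory; the image tag shown is an illustrative assumption, since actual tagging and publishing are handled by the shared build.yml workflow.

    # Hypothetical local equivalent of one matrix entry
    # (build-arg values copied verbatim from the configuration file above;
    # the tag mirrors the workflow's tag-suffix, but is only an assumption here)
    docker build \
        --build-arg VLLM_COMMIT='b6553be1bc75f046b00046a4ad7576364d03c835' \
        --build-arg FLASHINFER_COMMIT='v0.2.6.post1' \
        --build-arg BASE_IMAGE='ghcr.io/coreweave/ml-containers/torch-extras:es-compute-12.0-67208ca-nccl-cuda12.9.0-ubuntu22.04-nccl2.27.3-1-torch2.7.1-vision0.22.1-audio2.7.1-abi1' \
        -t vllm-tensorizer:b6553be1bc75f046b00046a4ad7576364d03c835 \
        vllm-tensorizer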