@@ -388,48 +388,33 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
 # $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
 
-# Allow specifying a version, Git revision or local .whl file
-ARG FLASHINFER_CUDA128_INDEX_URL="https://download.pytorch.org/whl/cu128/flashinfer"
-ARG FLASHINFER_CUDA128_WHEEL="flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl"
+# Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 ARG FLASHINFER_GIT_REF="v0.2.8rc1"
-# Flag to control whether to use pre-built FlashInfer wheels (set to false to force build from source)
-# TODO: Currently disabled because the pre-built wheels are not available for FLASHINFER_GIT_REF
-ARG USE_FLASHINFER_PREBUILT_WHEEL=false
 RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
     . /etc/environment
-    if [ "$TARGETPLATFORM" != "linux/arm64" ]; then
-        # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
-        if [[ "$CUDA_VERSION" == 12.8* ]] && [[ "$USE_FLASHINFER_PREBUILT_WHEEL" == "true" ]]; then
-            uv pip install --system ${FLASHINFER_CUDA128_INDEX_URL}/${FLASHINFER_CUDA128_WHEEL}
-        else
-            # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-            # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-            if [[ "${CUDA_VERSION}" == 11.* ]]; then
-                FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-            elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-                FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-            else
-                # CUDA 12.8+ supports 10.0a and 12.0
-                FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-            fi
-            echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-
-            git clone --depth 1 --recursive --shallow-submodules \
-                --branch ${FLASHINFER_GIT_REF} \
-                ${FLASHINFER_GIT_REPO} flashinfer
-
-            # Needed to build AOT kernels
-            pushd flashinfer
-                TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                    python3 -m flashinfer.aot
-                TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-                    uv pip install --system --no-build-isolation .
-            popd
-
-            rm -rf flashinfer
-        fi
-    fi
+    git clone --depth 1 --recursive --shallow-submodules \
+        --branch ${FLASHINFER_GIT_REF} \
+        ${FLASHINFER_GIT_REPO} flashinfer
+    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
+    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
+    if [[ "${CUDA_VERSION}" == 11.* ]]; then
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
+    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+    else
+        # CUDA 12.8+ supports 10.0a and 12.0
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+    fi
+    echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
+    # Needed to build AOT kernels
+    pushd flashinfer
+        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+            python3 -m flashinfer.aot
+        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+            uv pip install --system --no-build-isolation .
+    popd
+    rm -rf flashinfer
 BASH
 COPY examples examples
 COPY benchmarks benchmarks
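Note: with the prebuilt-wheel path gone, `FLASHINFER_GIT_REPO` and `FLASHINFER_GIT_REF` are the only remaining knobs for this step, and both are plain `ARG`s, so they can be overridden without editing the Dockerfile. A minimal sketch of such an override (the `-f` path and `-t` tag are illustrative assumptions, not taken from this diff):

```bash
# Hypothetical invocation: pin a different FlashInfer tag or fork at build time.
docker build \
  --build-arg FLASHINFER_GIT_REF=v0.2.8rc1 \
  --build-arg FLASHINFER_GIT_REPO=https://github.com/flashinfer-ai/flashinfer.git \
  -f docker/Dockerfile -t vllm-custom-flashinfer .
```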
@@ -521,10 +506,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
         uv pip install --system -r requirements/kv_connectors.txt; \
     fi; \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
+        BITSANDBYTES_VERSION="0.42.0"; \
     else \
-        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.46.1' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
-    fi
+        BITSANDBYTES_VERSION="0.46.1"; \
+    fi; \
+    uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]
 
 ENV VLLM_USAGE_SOURCE production-docker-image
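The consolidation above only hoists the platform-dependent bitsandbytes floor into a variable; the rest of the install line is shared. A standalone sketch of that branch logic, runnable outside Docker (the `echo` stands in for the real `uv pip install`, and `TARGETPLATFORM` is assumed to be injected by BuildKit in the actual build):

```bash
#!/usr/bin/env bash
# Sketch: choose the per-platform bitsandbytes minimum, then run one shared install.
TARGETPLATFORM="${TARGETPLATFORM:-linux/amd64}"  # BuildKit provides this in the real build
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then
    BITSANDBYTES_VERSION="0.42.0"
else
    BITSANDBYTES_VERSION="0.46.1"
fi
echo "would run: uv pip install --system \"bitsandbytes>=${BITSANDBYTES_VERSION}\" ..."
```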