Commit 49e8c7e

Authored by mgoin
Use NVCC --compress-mode to reduce binary size by 30% (#20694)
Signed-off-by: mgoin <mgoin64@gmail.com>
1 parent 805d62c commit 49e8c7e
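
Note: the commit title reports a roughly 30% smaller binary from compressing the embedded device code (fatbins) more aggressively, and the change below only enables the new flags when the detected nvcc is from CUDA 12.8 or newer. As a minimal, hypothetical sketch (not taken from this commit), the same flags could be applied in a standalone CMake project through CMAKE_CUDA_FLAGS; the project name, target name, and kernels.cu source below are placeholders.

# Hypothetical standalone sketch, not part of vLLM's build: apply the same
# fatbin compression flags project-wide via CMAKE_CUDA_FLAGS.
cmake_minimum_required(VERSION 3.18)
project(fatbin_compress_demo LANGUAGES CUDA)

# Guard on the detected nvcc version, mirroring the check added in this commit;
# older toolkits may reject the -compress-mode flag, hence the version gate.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
  string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all -compress-mode=size")
endif()

# Placeholder target and source file for illustration only.
add_library(demo_kernels STATIC kernels.cu)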

1 file changed (+19, -12 lines)


CMakeLists.txt

Lines changed: 19 additions & 12 deletions
@@ -171,6 +171,13 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
+#
+# Set nvcc fatbin compression.
+#
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_GPU_FLAGS "-Xfatbin" "-compress-all" "-compress-mode=size")
+endif()
+
 
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
@@ -392,7 +399,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.0 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
@@ -408,7 +415,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                      "later if you intend on running FP8 quantized models on "
@@ -423,7 +430,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.8 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
@@ -437,7 +444,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
                      "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                      "later if you intend on running FP8 quantized models on "
@@ -452,7 +459,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
@@ -467,7 +474,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
                      "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                      "later if you intend on running FP8 quantized models on "
@@ -510,7 +517,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
   # require CUDA 12.2 or later (and only work on Hopper).
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -519,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
     message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
       message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                      "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                      "if you intend on running FP8 sparse quantized models on Hopper.")
@@ -531,7 +538,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # FP4 Archs and flags
   cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
       "csrc/quantization/fp4/nvfp4_experts_quant.cu"
@@ -552,7 +559,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # CUTLASS MLA Archs and flags
   cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
     set(SRCS
       "csrc/attention/mla/cutlass_mla_kernels.cu")
     set_gencode_flags_for_srcs(
@@ -641,7 +648,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The machete kernels only work on hopper and require CUDA 12.0 or later.
   # Only build Machete kernels if we are building for something compatible with sm90a
   cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS)
     #
     # For the Machete kernels we automatically generate sources for various
     # preselected input type pairs and schedules.
@@ -693,7 +700,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
     message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
         AND MACHETE_ARCHS)
       message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
