@@ -171,6 +171,13 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
171
171
list (APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS} " )
172
172
endif ()
173
173
174
#
# Set nvcc fatbin compression.
#
# The flags are only appended when building for CUDA with a CUDA compiler of
# version 12.8 or newer.
#
# The GPU language is tested first, in its own if(), and the compiler version
# is expanded inside quotes. CMAKE_CUDA_COMPILER_VERSION can be empty when CUDA
# is not the GPU language; an unquoted empty expansion would leave
# VERSION_GREATER_EQUAL without a left-hand operand and abort configuration,
# because if() parses the whole expression and does not short-circuit on AND.
#
if(VLLM_GPU_LANG STREQUAL "CUDA")
  if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_GREATER_EQUAL 12.8)
    list(APPEND VLLM_GPU_FLAGS "-Xfatbin" "-compress-all" "-compress-mode=size")
  endif()
endif()
180
+
174
181
175
182
#
176
183
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
@@ -392,7 +399,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
392
399
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
393
400
# CUDA 12.0 or later
394
401
cuda_archs_loose_intersection (SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS} " )
395
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS )
402
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS )
396
403
set (SRCS
397
404
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
398
405
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
@@ -408,7 +415,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
408
415
list (APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS} " )
409
416
message (STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS} " )
410
417
else ()
411
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS )
418
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS )
412
419
message (STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
413
420
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
414
421
"later if you intend on running FP8 quantized models on "
@@ -423,7 +430,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
423
430
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
424
431
# CUDA 12.8 or later
425
432
cuda_archs_loose_intersection (SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS} " )
426
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS )
433
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS )
427
434
set (SRCS
428
435
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
429
436
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
@@ -437,7 +444,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
437
444
list (APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS} " )
438
445
message (STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS} " )
439
446
else ()
440
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS )
447
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS )
441
448
message (STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
442
449
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
443
450
"later if you intend on running FP8 quantized models on "
@@ -452,7 +459,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
452
459
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
453
460
# require CUDA 12.8 or later
454
461
cuda_archs_loose_intersection (SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS} " )
455
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS )
462
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS )
456
463
set (SRCS
457
464
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
458
465
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
@@ -467,7 +474,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
467
474
list (APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS} " )
468
475
message (STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS} " )
469
476
else ()
470
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS )
477
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS )
471
478
message (STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
472
479
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
473
480
"later if you intend on running FP8 quantized models on "
@@ -510,7 +517,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
510
517
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
511
518
# require CUDA 12.2 or later (and only work on Hopper).
512
519
cuda_archs_loose_intersection (SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS} " )
513
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS )
520
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS )
514
521
set (SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu" )
515
522
set_gencode_flags_for_srcs (
516
523
SRCS "${SRCS} "
@@ -519,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
519
526
list (APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1" )
520
527
message (STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS} " )
521
528
else ()
522
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS )
529
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS )
523
530
message (STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
524
531
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
525
532
"if you intend on running FP8 sparse quantized models on Hopper." )
@@ -531,7 +538,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
531
538
532
539
# FP4 Archs and flags
533
540
cuda_archs_loose_intersection (FP4_ARCHS "10.0a" "${CUDA_ARCHS} " )
534
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS )
541
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS )
535
542
set (SRCS
536
543
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
537
544
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
@@ -552,7 +559,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
552
559
553
560
# CUTLASS MLA Archs and flags
554
561
cuda_archs_loose_intersection (MLA_ARCHS "10.0a" "${CUDA_ARCHS} " )
555
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS )
562
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS )
556
563
set (SRCS
557
564
"csrc/attention/mla/cutlass_mla_kernels.cu" )
558
565
set_gencode_flags_for_srcs (
@@ -641,7 +648,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
641
648
# The machete kernels only work on hopper and require CUDA 12.0 or later.
642
649
# Only build Machete kernels if we are building for something compatible with sm90a
643
650
cuda_archs_loose_intersection (MACHETE_ARCHS "9.0a" "${CUDA_ARCHS} " )
644
- if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS )
651
+ if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS )
645
652
#
646
653
# For the Machete kernels we automatically generate sources for various
647
654
# preselected input type pairs and schedules.
@@ -693,7 +700,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
693
700
694
701
message (STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS} " )
695
702
else ()
696
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
703
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
697
704
AND MACHETE_ARCHS )
698
705
message (STATUS "Not building Machete kernels as CUDA Compiler version is "
699
706
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
0 commit comments