
Commit 4e96968

[Quantization] Add compressed-tensors emulations support for NVFP4 (vllm-project#19879)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Dipika <dipikasikka1@gmail.com>
1 parent 6639ff1 commit 4e96968

File tree

3 files changed (+26, -1 lines)


vllm/envs.py

Lines changed: 7 additions & 0 deletions

@@ -133,6 +133,7 @@
     VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
     VLLM_KV_CACHE_LAYOUT: Optional[str] = None
     VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
+    VLLM_USE_NVFP4_CT_EMULATIONS: bool = False


 def get_default_cache_root():
@@ -918,6 +919,12 @@ def get_vllm_port() -> Optional[int]:
     # or bad hardware but it may add compute overhead.
     "VLLM_COMPUTE_NANS_IN_LOGITS":
     lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),
+
+    # Controls whether or not emulations are used for NVFP4
+    # generations on machines with compute capability < 100
+    # for compressed-tensors models
+    "VLLM_USE_NVFP4_CT_EMULATIONS":
+    lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")))
 }

 # --8<-- [end:env-vars-definition]
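
The flag is read through vllm.envs, whose values are evaluated lazily on access, so it can be set in the shell or from Python before the engine is built. A minimal usage sketch, assuming an NVFP4 compressed-tensors checkpoint (the model name below is a hypothetical placeholder, not one shipped with this commit):

import os

# Opt in to NVFP4 emulation before constructing the engine.
os.environ["VLLM_USE_NVFP4_CT_EMULATIONS"] = "1"

from vllm import LLM

# Hypothetical NVFP4 compressed-tensors checkpoint, for illustration only.
llm = LLM(model="some-org/some-model-NVFP4")
outputs = llm.generate(["The capital of France is"])
print(outputs[0].outputs[0].text)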

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 3 additions & 1 deletion

@@ -13,6 +13,7 @@
     QuantizationType)
 from pydantic import BaseModel

+import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@@ -374,7 +375,8 @@ def _get_scheme_from_parts(

         if is_activation_quantization_format(self.quant_format):
             if self._is_fp4a4_nvfp4(weight_quant, input_quant):
-                if CompressedTensorsW4A4Fp4.cutlass_fp4_supported():
+                if CompressedTensorsW4A4Fp4.cutlass_fp4_supported(
+                ) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
                     return CompressedTensorsW4A4Fp4()
                 else:
                     logger.warning_once(
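
The effect of this change: when native CUTLASS FP4 kernels are unavailable, setting VLLM_USE_NVFP4_CT_EMULATIONS still selects CompressedTensorsW4A4Fp4 rather than falling through to the warning branch. For intuition, cutlass_fp4_supported() is a hardware gate (the imports added in the next file suggest it goes through cutlass_scaled_mm_supports_fp4 from vllm._custom_ops); a rough sketch of such a gate, not vLLM's exact implementation:

import torch

def cutlass_fp4_supported_sketch() -> bool:
    # Illustrative stand-in: native FP4 CUTLASS kernels target
    # Blackwell-class GPUs, i.e. compute capability >= 10.0
    # (the "100" that get_min_capability returns below).
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor >= 100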

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py

Lines changed: 16 additions & 0 deletions

@@ -4,11 +4,14 @@
 import torch
 from torch.nn.parameter import Parameter

+import vllm.envs as envs
 from vllm._custom_ops import (cutlass_scaled_fp4_mm,
                               cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
+    run_nvfp4_emulations)
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                            ModelWeightParameter,
                                            PerTensorScaleParameter)
@@ -26,6 +29,8 @@ def __init__(self):

     @classmethod
     def get_min_capability(cls) -> int:
+        if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
+            return 80
         return 100

     @classmethod
@@ -129,6 +134,17 @@ def apply_weights(self,
                       x: torch.Tensor,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:

+        if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
+            out = run_nvfp4_emulations(
+                x=x,
+                input_global_scale=layer.input_global_scale,
+                weight=layer.weight,
+                weight_scale_swizzled=layer.weight_scale_swizzled,
+                weight_global_scale=layer.weight_global_scale)
+            if bias is not None:
+                out = out + bias
+            return out
+
         output_dtype = x.dtype
         output_shape = [x.shape[0], layer.weight.shape[0]]
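
run_nvfp4_emulations is the new helper this path calls; its body is not part of this diff. Conceptually, an NVFP4 emulation must unpack the 4-bit E2M1 weight codes, apply the per-16-element block scales plus the global scale, and then run an ordinary matmul in a widely supported dtype. A simplified sketch of that idea, not the actual nvfp4_emulation_utils implementation (it assumes codes are already unpacked to one integer per element and scales are unswizzled, and it glosses over how the global scale and activation quantization are composed):

import torch

# The eight magnitudes representable in FP4 (E2M1).
FP4_MAGNITUDES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def dequant_nvfp4_sketch(codes: torch.Tensor,
                         block_scales: torch.Tensor,
                         global_scale: float,
                         group_size: int = 16) -> torch.Tensor:
    # codes:        (out, in) integer tensor of 4-bit values, bit 3 = sign.
    # block_scales: (out, in // group_size) per-block scales.
    table = FP4_MAGNITUDES.to(codes.device)
    sign = 1.0 - 2.0 * ((codes >> 3) & 0x1).to(torch.float32)
    magnitude = table[(codes & 0x7).long()]
    w = (sign * magnitude).reshape(codes.shape[0], -1, group_size)
    w = w * block_scales.to(torch.float32).unsqueeze(-1)
    return w.reshape(codes.shape) * global_scale

def emulated_linear_sketch(x: torch.Tensor, codes: torch.Tensor,
                           block_scales: torch.Tensor,
                           global_scale: float) -> torch.Tensor:
    # Dequantize once, then fall back to a plain GEMM: no FP4 hardware
    # required, which is the point of the emulation path.
    w = dequant_nvfp4_sketch(codes, block_scales, global_scale)
    return x @ w.to(x.dtype).t()

Note also the get_min_capability change above (100 -> 80 under the flag): that is what lets the scheme pass vLLM's capability check on pre-Blackwell GPUs when emulation is enabled.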
