3 files changed: +26 -1 lines changed

vllm/envs.py

@@ -133,6 +133,7 @@
 VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: int = 300
 VLLM_KV_CACHE_LAYOUT: Optional[str] = None
 VLLM_COMPUTE_NANS_IN_LOGITS: bool = False
+VLLM_USE_NVFP4_CT_EMULATIONS: bool = False


 def get_default_cache_root():
@@ -918,6 +919,12 @@ def get_vllm_port() -> Optional[int]:
     # or bad hardware but it may add compute overhead.
     "VLLM_COMPUTE_NANS_IN_LOGITS":
     lambda: bool(int(os.getenv("VLLM_COMPUTE_NANS_IN_LOGITS", "0"))),
+
+    # Controls whether emulations are used for NVFP4
+    # generation with compressed-tensors models on machines
+    # with compute capability < 100 (pre-Blackwell).
+    "VLLM_USE_NVFP4_CT_EMULATIONS":
+    lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0")))
 }

 # --8<-- [end:env-vars-definition]
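
With the flag defined, enabling the emulation path is just a matter of setting the environment variable before the engine reads it. A minimal usage sketch; the checkpoint name is a placeholder, not a model referenced by this PR:

import os

# The flag is read lazily, but set it before constructing the engine to be safe.
os.environ["VLLM_USE_NVFP4_CT_EMULATIONS"] = "1"

from vllm import LLM

# Placeholder checkpoint: substitute any NVFP4 compressed-tensors model.
llm = LLM(model="some-org/some-model-NVFP4")
print(llm.generate("Hello")[0].outputs[0].text)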
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

@@ -13,6 +13,7 @@
     QuantizationType)
 from pydantic import BaseModel

+import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@@ -374,7 +375,8 @@ def _get_scheme_from_parts(

         if is_activation_quantization_format(self.quant_format):
             if self._is_fp4a4_nvfp4(weight_quant, input_quant):
-                if CompressedTensorsW4A4Fp4.cutlass_fp4_supported():
+                if CompressedTensorsW4A4Fp4.cutlass_fp4_supported(
+                ) or envs.VLLM_USE_NVFP4_CT_EMULATIONS:
                     return CompressedTensorsW4A4Fp4()
                 else:
                     logger.warning_once(
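
For context on what the new `or` branch bypasses: `cutlass_fp4_supported()` is not shown in this diff, but it presumably reduces to a device-capability probe against the CUTLASS FP4 kernel. The sketch below (written as a free function) is an assumption about that body; only `cutlass_scaled_mm_supports_fp4` is taken from the imports in the next file:

import torch
from vllm._custom_ops import cutlass_scaled_mm_supports_fp4

def cutlass_fp4_supported() -> bool:
    # Sketch of the probe: no CUDA device means no CUTLASS FP4 kernel at all.
    if not torch.cuda.is_available():
        return False
    # vLLM encodes compute capability as major * 10 + minor, e.g. 100 = SM 10.0.
    major, minor = torch.cuda.get_device_capability()
    return cutlass_scaled_mm_supports_fp4(major * 10 + minor)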
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a4_nvfp4.py

@@ -4,11 +4,14 @@
 import torch
 from torch.nn.parameter import Parameter

+import vllm.envs as envs
 from vllm._custom_ops import (cutlass_scaled_fp4_mm,
                               cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
+    run_nvfp4_emulations)
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                            ModelWeightParameter,
                                            PerTensorScaleParameter)
@@ -26,6 +29,8 @@ def __init__(self):

     @classmethod
     def get_min_capability(cls) -> int:
+        if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
+            return 80
         return 100

     @classmethod
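
The capability values follow the same major * 10 + minor encoding: 100 is SM 10.0 (Blackwell, the generation with native FP4 tensor-core support), while 80 is SM 8.0, so with emulations enabled the scheme becomes selectable on any Ampere-or-newer GPU. A quick check of what a given device reports (standard torch API):

import torch

major, minor = torch.cuda.get_device_capability()
capability = major * 10 + minor  # e.g. 90 on H100 (SM 9.0)
print(f"capability={capability}; emulated NVFP4 eligible: {capability >= 80}")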
@@ -129,6 +134,17 @@ def apply_weights(self,
                       x: torch.Tensor,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:

+        if envs.VLLM_USE_NVFP4_CT_EMULATIONS:
+            out = run_nvfp4_emulations(
+                x=x,
+                input_global_scale=layer.input_global_scale,
+                weight=layer.weight,
+                weight_scale_swizzled=layer.weight_scale_swizzled,
+                weight_global_scale=layer.weight_global_scale)
+            if bias is not None:
+                out = out + bias
+            return out
+
         output_dtype = x.dtype
         output_shape = [x.shape[0], layer.weight.shape[0]]
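
The body of `run_nvfp4_emulations` lives in `nvfp4_emulation_utils` and is not shown in this diff. Conceptually, emulating a W4A4 NVFP4 matmul means round-tripping the activations through the FP4 grid and multiplying against the dequantized weight in high precision. The sketch below illustrates only that idea: it assumes group size 16, folds the FP8 per-group scales and the global scales into a single float scale per group, and takes an already-dequantized weight, none of which matches the real helper's packed-weight signature:

from typing import Optional

import torch

# The eight non-negative magnitudes representable in FP4 e2m1, NVFP4's element type.
E2M1 = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def nvfp4_fake_quant(t: torch.Tensor, group_size: int = 16) -> torch.Tensor:
    """Round-trip t through the NVFP4 grid: scale each group so its max maps
    to 6.0, snap to the nearest e2m1 value, then rescale back. Assumes
    t.numel() is divisible by group_size."""
    g = t.float().reshape(-1, group_size)
    scale = (g.abs().amax(dim=-1, keepdim=True) / 6.0).clamp_min(1e-12)
    scaled = g / scale
    nearest = (scaled.abs().unsqueeze(-1) - E2M1).abs().argmin(dim=-1)
    deq = E2M1[nearest] * torch.sign(scaled) * scale
    return deq.reshape(t.shape).to(t.dtype)

def emulated_nvfp4_linear(x: torch.Tensor, w_dequant: torch.Tensor,
                          bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Emulated W4A4 GEMM: fake-quantized activations times dequantized weight."""
    out = nvfp4_fake_quant(x) @ w_dequant.t()
    return out if bias is None else out + bias

On a real layer this is wired up exactly as in the diff above: the emulated output replaces the CUTLASS path, and bias is added afterwards.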