File tree Expand file tree Collapse file tree 1 file changed +4
-6
lines changed
vllm/model_executor/layers/quantization/compressed_tensors Expand file tree Collapse file tree 1 file changed +4
-6
lines changed Original file line number Diff line number Diff line change @@ -929,10 +929,8 @@ def apply(
             scoring_func=scoring_func,
             e_score_correction_bias=e_score_correction_bias)
 
-        a1_scale = layer.w13_input_scale
-        a2_scale = layer.w2_input_scale
-        per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
-            a2_scale.numel() != 1 if a2_scale is not None else False)
+        per_act_token = (
+            self.input_quant.strategy == QuantizationStrategy.TOKEN)
 
         if self.fused_experts is None:
             # If no modular kernel is provided, use cutlass_moe_fp8
@@ -950,8 +948,8 @@ def apply(
                 expert_map=None if self.disable_expert_map else expert_map,
                 w1_scale=layer.w13_weight_scale,
                 w2_scale=layer.w2_weight_scale,
-                a1_scale=a1_scale,
-                a2_scale=a2_scale,
+                a1_scale=layer.w13_input_scale,
+                a2_scale=layer.w2_input_scale,
             )
         else:
             return self.fused_experts(
You can’t perform that action at this time.
0 commit comments