File tree Expand file tree Collapse file tree 1 file changed +1
-17
lines changed
vllm/model_executor/layers/fused_moe Expand file tree Collapse file tree 1 file changed +1
-17
lines changed Original file line number Diff line number Diff line change @@ -136,23 +136,7 @@ def prepare(
136
136
"apply_router_weight_on_input is only implemented for topk=1" )
137
137
a1 = a1 * topk_weights .to (a1 .dtype )
138
138
139
- # Check if there is a block_shape / or if we can infer the quantization
140
- # schemes from the scales.
141
- per_token_quant = None
142
- if all ([
143
- x is None
144
- for x in [quant_config .block_shape , a1_scale , a2_scale ]
145
- ]) and quant_config .quant_dtype is not None :
146
- # Quantization required despite none of the inputs suggesting
147
- # quantization. Fallback to per_token_dynamic quant.
148
- per_token_quant = True
149
- else :
150
- per_token_quant = ((quant_config .block_shape is not None ) or
151
- (a1_scale is not None and a1_scale .numel () != 1 )
152
- or (a2_scale is not None
153
- and a2_scale .numel () != 1 ))
154
-
155
- if per_token_quant :
139
+ if quant_config .per_act_token_quant :
156
140
a1q , a1q_scale = moe_kernel_quantize_input (
157
141
a1 ,
158
142
a1_scale ,
You can’t perform that action at this time.
0 commit comments