1 parent 5cf9ff1 commit 7f566a2
vllm_ascend/quantization/w8a8_dynamic.py
@@ -663,6 +663,13 @@ def process_weights_after_loading(self, layer):
             1, 2).contiguous()
         layer.w2_weight.data = layer.w2_weight.data.transpose(
             1, 2).contiguous()
+        # This optimization relies on modifications in torch_npu; without them,
+        # accuracy problems will occur. But we can still evaluate the inference
+        # speed by transforming the weights to NZ format (29).
+        layer.w13_weight.data = torch_npu.npu_format_cast(
+            layer.w13_weight.data, 29)
+        layer.w2_weight.data = torch_npu.npu_format_cast(
+            layer.w2_weight.data, 29)
         layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
             layer.w13_weight_scale.data.shape[0], -1)
         layer.w13_weight_offset.data = layer.w13_weight_offset.data.view(
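For reference, a minimal standalone sketch of the NZ cast this patch performs, assuming torch_npu is installed and an Ascend NPU is available; the tensor shape and variable names below are illustrative, not taken from the patch:

import torch
import torch_npu

ACL_FORMAT_FRACTAL_NZ = 29  # the ACL format id the patch passes to npu_format_cast

# Illustrative int8 expert weight on the NPU; the shape is made up for the example.
w = torch.randint(-128, 127, (8, 1024, 4096), dtype=torch.int8).npu()

# Cast the on-device storage layout to FRACTAL_NZ; the values are unchanged,
# only the memory format differs, which is what the patch evaluates for speed.
w_nz = torch_npu.npu_format_cast(w, ACL_FORMAT_FRACTAL_NZ)

print(torch_npu.get_npu_format(w_nz))  # expected to report format 29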