From 99fbe2c2225cdc4769aa921cce5f252b8f8be808 Mon Sep 17 00:00:00 2001
From: linfeng-yuan <1102311262@qq.com>
Date: Tue, 20 May 2025 19:25:46 +0800
Subject: [PATCH] [perf]: using NZ optimization for quantized GMM

Signed-off-by: linfeng-yuan <1102311262@qq.com>
---
 vllm_ascend/quantization/w8a8_dynamic.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py
index 5d2b442cf..3fe111ddd 100644
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -663,6 +663,13 @@ def process_weights_after_loading(self, layer):
                 1, 2).contiguous()
             layer.w2_weight.data = layer.w2_weight.data.transpose(
                 1, 2).contiguous()
+        # This optimization relies on matching modifications in torch_npu; without
+        # them, accuracy problems will occur. With those changes in place, inference
+        # speed can be evaluated by casting the weights to NZ format (format id 29).
+        layer.w13_weight.data = torch_npu.npu_format_cast(
+            layer.w13_weight.data, 29)
+        layer.w2_weight.data = torch_npu.npu_format_cast(
+            layer.w2_weight.data, 29)
         layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
             layer.w13_weight_scale.data.shape[0], -1)
         layer.w13_weight_offset.data = layer.w13_weight_offset.data.view(
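
For context on what the added hunk does, below is a minimal standalone sketch; it is not part of the patch. The helper name cast_moe_weights_to_nz, the constant name, and the bare layer object are assumptions for illustration, while torch_npu.npu_format_cast and the format id 29 (the NZ layout) come from the patch itself.

    import torch_npu  # Ascend NPU extension for PyTorch

    # Format id passed to npu_format_cast in the patch; assumed to denote the NZ layout.
    ACL_FORMAT_FRACTAL_NZ = 29


    def cast_moe_weights_to_nz(layer):
        """Hypothetical helper mirroring the patch: cast the grouped-matmul (GMM)
        weights to the NZ layout once, at weight-loading time, so the quantized
        GMM kernel does not have to convert them on every forward pass.
        Per the patch comment, this requires matching torch_npu modifications,
        otherwise accuracy problems can occur."""
        layer.w13_weight.data = torch_npu.npu_format_cast(
            layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
        layer.w2_weight.data = torch_npu.npu_format_cast(
            layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)

The cast is done in process_weights_after_loading rather than in the forward path, so the one-time layout conversion cost is paid at load time instead of on every decode step.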