From 99fbe2c2225cdc4769aa921cce5f252b8f8be808 Mon Sep 17 00:00:00 2001
From: linfeng-yuan <1102311262@qq.com>
Date: Tue, 20 May 2025 19:25:46 +0800
Subject: [PATCH] [perf]: using NZ optimization for quantized GMM

Signed-off-by: linfeng-yuan <1102311262@qq.com>
---
 vllm_ascend/quantization/w8a8_dynamic.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py
index 5d2b442cf..3fe111ddd 100644
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -663,6 +663,13 @@ def process_weights_after_loading(self, layer):
                 1, 2).contiguous()
             layer.w2_weight.data = layer.w2_weight.data.transpose(
                 1, 2).contiguous()
+        # This optimization relies on matching modifications in torch_npu; without
+        # them, accuracy problems will occur. With those changes in place, inference
+        # speed can be evaluated by casting the weights to NZ format (format id 29).
+        layer.w13_weight.data = torch_npu.npu_format_cast(
+            layer.w13_weight.data, 29)
+        layer.w2_weight.data = torch_npu.npu_format_cast(
+            layer.w2_weight.data, 29)
         layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
             layer.w13_weight_scale.data.shape[0], -1)
         layer.w13_weight_offset.data = layer.w13_weight_offset.data.view(
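
For context on what the added hunk does, below is a minimal standalone sketch; it is not part of the patch. The helper name cast_moe_weights_to_nz, the constant name, and the bare layer object are assumptions for illustration, while torch_npu.npu_format_cast and the format id 29 (the NZ layout) come from the patch itself.

    import torch_npu  # Ascend NPU extension for PyTorch

    # Format id passed to npu_format_cast in the patch; assumed to denote the NZ layout.
    ACL_FORMAT_FRACTAL_NZ = 29


    def cast_moe_weights_to_nz(layer):
        """Hypothetical helper mirroring the patch: cast the grouped-matmul (GMM)
        weights to the NZ layout once, at weight-loading time, so the quantized
        GMM kernel does not have to convert them on every forward pass.
        Per the patch comment, this requires matching torch_npu modifications,
        otherwise accuracy problems can occur."""
        layer.w13_weight.data = torch_npu.npu_format_cast(
            layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
        layer.w2_weight.data = torch_npu.npu_format_cast(
            layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)

The cast is done in process_weights_after_loading rather than in the forward path, so the one-time layout conversion cost is paid at load time instead of on every decode step.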