Commit 8e42f71

[0.9.1][BugFix] Fix the failure to recognize the actual type of quantization (#1721)
### What this PR does / why we need it?
Fix the failure to recognize the actual type of quantization in layernorm, which caused the expected fused branch not to be executed.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with newly added and existing tests.

Signed-off-by: rjg-lyh <1318825571@qq.com>
Parent: 2ab9b16 · Commit: 8e42f71
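The root cause, as the diff below shows, is that a quantized linear layer's `quant_method` attribute is an Ascend wrapper object, and the concrete per-layer scheme lives one level deeper in that wrapper's own `quant_method` attribute. The following is a minimal, self-contained sketch of that layout using toy stand-in classes (`AscendLinearMethodWrapper` is a hypothetical name for illustration, not the real vllm-ascend type); it only shows why the original `isinstance` check could never match, so the fused branch was silently skipped.

```python
class AscendW8A8LinearMethod:          # stand-in for the real W8A8 linear scheme
    pass


class AscendLinearMethodWrapper:       # hypothetical stand-in for the outer wrapper
    def __init__(self, scheme):
        self.quant_method = scheme     # the actual scheme lives one level deeper


class QKVProj:                         # stand-in for a quantized linear layer
    def __init__(self):
        self.quant_method = AscendLinearMethodWrapper(AscendW8A8LinearMethod())


qkv_proj = QKVProj()

# Buggy check (pre-PR): compares the wrapper itself, so it never matches
# and the fused AddRMSNorm + quant branch is never taken.
assert not isinstance(qkv_proj.quant_method, AscendW8A8LinearMethod)

# Fixed check (this PR): unwrap one level to reach the real scheme.
assert isinstance(qkv_proj.quant_method.quant_method, AscendW8A8LinearMethod)
```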

File tree

2 files changed (+7 −11 lines):

  vllm_ascend/models/qwen3.py
  vllm_ascend/ops/layernorm.py

vllm_ascend/models/qwen3.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -18,7 +18,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from vllm_ascend.ops.layernorm import AddRMSNormQuant
+from vllm_ascend.ops.layernorm import AddRMSNormW8A8Quant
 
 
 class CustomQwen3DecoderLayer(Qwen3DecoderLayer):
@@ -43,15 +43,15 @@ def __init__(
         assert isinstance(quant_config, AscendQuantConfig), \
             "Expected quant_config to be an instance of AscendQuantConfig"
 
-        if isinstance(self.self_attn.qkv_proj.quant_method,
+        if isinstance(self.self_attn.qkv_proj.quant_method.quant_method,
                       AscendW8A8LinearMethod):
-            self.input_layernorm = AddRMSNormQuant(
+            self.input_layernorm = AddRMSNormW8A8Quant(
                 config.hidden_size,
                 layer=self.self_attn.qkv_proj,
                 eps=config.rms_norm_eps)
-        if isinstance(self.mlp.gate_up_proj.quant_method,
+        if isinstance(self.mlp.gate_up_proj.quant_method.quant_method,
                       AscendW8A8LinearMethod):
-            self.post_attention_layernorm = AddRMSNormQuant(
+            self.post_attention_layernorm = AddRMSNormW8A8Quant(
                 config.hidden_size,
                 layer=self.mlp.gate_up_proj,
                 eps=config.rms_norm_eps)
```

vllm_ascend/ops/layernorm.py

Lines changed: 2 additions & 6 deletions
```diff
@@ -21,12 +21,8 @@
 from vllm.model_executor.layers.layernorm import RMSNorm
 
 
-class AddRMSNormQuant(RMSNorm):
-    """Root mean square normalization.
-
-    Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
-    Refer to https://arxiv.org/abs/1910.07467
-    """
+class AddRMSNormW8A8Quant(RMSNorm):
+    # Fuse AddRmsNorm and W8A8 quantization ops together
 
     def __init__(
         self,
```

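The renamed class's comment describes it as fusing AddRMSNorm with W8A8 quantization. As a rough illustration of what that fusion computes, here is a conceptual sketch in plain PyTorch (residual add, RMS normalization, then symmetric per-tensor int8 activation quantization). It is not the actual vllm-ascend implementation, which dispatches to a fused Ascend NPU kernel and may handle scales and offsets differently.

```python
# Conceptual sketch only: plain-PyTorch illustration of an AddRMSNorm + W8A8
# activation-quantization fusion. Names and scale handling are assumptions,
# not the real vllm-ascend kernel.
import torch


def add_rms_norm_w8a8(x: torch.Tensor,
                      residual: torch.Tensor,
                      weight: torch.Tensor,
                      input_scale: torch.Tensor,
                      eps: float = 1e-6):
    # 1. Residual add (the "Add" in AddRMSNorm).
    hidden = x + residual
    # 2. RMS normalization: h * w / sqrt(mean(h^2) + eps).
    variance = hidden.pow(2).mean(dim=-1, keepdim=True)
    normed = hidden * torch.rsqrt(variance + eps) * weight
    # 3. Static per-tensor int8 activation quantization (the activation side of W8A8).
    quantized = torch.clamp(torch.round(normed / input_scale), -128, 127).to(torch.int8)
    # Return the int8 activations plus the un-quantized sum for the next residual.
    return quantized, hidden


# Example usage with toy shapes:
x = torch.randn(2, 8)
res = torch.randn(2, 8)
w = torch.ones(8)
scale = torch.tensor(0.05)
q, new_res = add_rms_norm_w8a8(x, res, w, scale)
```

Returning the un-quantized residual sum alongside the int8 activations follows the usual add-RMSNorm pattern, where the next layer's residual connection still needs the full-precision hidden states.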