Commit 5956ef0

Author: weijinqian_v1

handle code clean

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>

Merge commit 5956ef0 (2 parents: adf3f74 + 2351977)

File tree: 9 files changed (+307, -36 lines)

tests/singlecard/test_ascend_config.py

Lines changed: 64 additions & 19 deletions
@@ -54,11 +54,12 @@ def test_run_with_ascend_config():
             # torchair graph only works with deepseek. The e2e test should be added
             # in multicard test with deepseek models.
             "enabled": False,
-            "use_cached_graph": True,
-            "graph_batch_sizes": [1, 2, 4, 8],
+            "use_cached_graph": False,
+            "graph_batch_sizes": [],
             "graph_batch_sizes_init": False,
-            "enable_multistream_moe": True,
-            "enable_multistream_mla": True,
+            "enable_multistream_moe": False,
+            "enable_multistream_mla": False,
+            "enable_view_optimize": False,
         },
         "ascend_scheduler_config": {
             "enabled": True,
@@ -73,13 +74,12 @@ def test_run_with_ascend_config():
         ascend_config = get_ascend_config()

         assert not ascend_config.torchair_graph_config.enabled
-        assert ascend_config.torchair_graph_config.use_cached_graph
-        assert ascend_config.torchair_graph_config.graph_batch_sizes == [
-            1, 2, 4, 8
-        ]
+        assert not ascend_config.torchair_graph_config.use_cached_graph
+        assert ascend_config.torchair_graph_config.graph_batch_sizes == []
         assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
-        assert ascend_config.torchair_graph_config.enable_multistream_mla
-        assert ascend_config.torchair_graph_config.enable_multistream_moe
+        assert not ascend_config.torchair_graph_config.enable_multistream_mla
+        assert not ascend_config.torchair_graph_config.enable_multistream_moe
+        assert not ascend_config.torchair_graph_config.enable_view_optimize
         assert ascend_config.ascend_scheduler_config.enabled
         assert ascend_config.ascend_scheduler_config.enable_chunked_prefill

@@ -142,6 +142,58 @@ def test_ascend_config_load_error():
                         additional_config=input_additional_config_fake_3):
            pass

+    # use_cached_graph should not be enabled without torchair graph mode
+    with pytest.raises(RuntimeError):
+        input_additional_config_fake_4 = {
+            "torchair_graph_config": {
+                "enabled": False,
+                "use_cached_graph": True,
+            },
+        }
+        with VllmRunner("facebook/opt-125m",
+                        enforce_eager=True,
+                        additional_config=input_additional_config_fake_4):
+            pass
+
+    # graph_batch_sizes_init should not be enabled without torchair graph mode
+    with pytest.raises(RuntimeError):
+        input_additional_config_fake_5 = {
+            "torchair_graph_config": {
+                "enabled": False,
+                "graph_batch_sizes_init": True,
+            },
+        }
+        with VllmRunner("facebook/opt-125m",
+                        enforce_eager=True,
+                        additional_config=input_additional_config_fake_5):
+            pass
+
+    # enable_multistream_mla should not be enabled without torchair graph mode
+    with pytest.raises(RuntimeError):
+        input_additional_config_fake_6 = {
+            "torchair_graph_config": {
+                "enabled": False,
+                "enable_multistream_mla": True,
+            },
+        }
+        with VllmRunner("facebook/opt-125m",
+                        enforce_eager=True,
+                        additional_config=input_additional_config_fake_6):
+            pass
+
+    # enable_multistream_moe should not be enabled without torchair graph mode
+    with pytest.raises(RuntimeError):
+        input_additional_config_fake_7 = {
+            "torchair_graph_config": {
+                "enabled": False,
+                "enable_multistream_moe": True,
+            },
+        }
+        with VllmRunner("facebook/opt-125m",
+                        enforce_eager=True,
+                        additional_config=input_additional_config_fake_7):
+            pass
+

 @_clean_up_ascend_config
 def test_check_ascend_config_v0():
@@ -168,9 +220,7 @@ def test_ascend_config_refresh():
     input_additional_config = {
         "torchair_graph_config": {
             "enabled": False,
-            "use_cached_graph": True,
-            "graph_batch_sizes": [1, 2, 4, 8],
-            "graph_batch_sizes_init": False,
+            "enable_view_optimize": False
         },
         "refresh": True,
     }
@@ -180,9 +230,4 @@ def test_ascend_config_refresh():
                     additional_config=input_additional_config):
         ascend_config = get_ascend_config()

-        assert not ascend_config.torchair_graph_config.enabled
-        assert ascend_config.torchair_graph_config.use_cached_graph
-        assert ascend_config.torchair_graph_config.graph_batch_sizes == [
-            1, 2, 4, 8
-        ]
-        assert not ascend_config.torchair_graph_config.graph_batch_sizes_init
+        assert not ascend_config.torchair_graph_config.enable_view_optimize
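
By contrast, these options remain legal once torchair graph mode is actually on. A minimal sketch of an additional_config that passes the new validation; the specific batch sizes are illustrative, and only the key names come from this diff:

# Illustrative only: the graph-only options below are accepted because
# "enabled" is True; with "enabled": False each of them now raises RuntimeError.
valid_additional_config = {
    "torchair_graph_config": {
        "enabled": True,
        "use_cached_graph": True,
        "graph_batch_sizes": [1, 2, 4, 8],  # illustrative sizes
        "enable_multistream_mla": True,
        "enable_multistream_moe": True,
    },
    "ascend_scheduler_config": {
        "enabled": True,
    },
}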

vllm_ascend/ascend_config.py

Lines changed: 25 additions & 0 deletions
@@ -70,6 +70,31 @@ def __init__(self, torchair_graph_config):
             raise ValueError(
                 "graph_batch_sizes_init is only valid when graph_batch_sizes is empty"
             )
+        if not self.enabled:
+            if self.use_cached_graph:
+                raise RuntimeError(
+                    "use_cached_graph is valid only when Torchair graph mode is enabled"
+                )
+            if self.graph_batch_sizes:
+                raise RuntimeError(
+                    "graph_batch_sizes is valid only when Torchair graph mode is enabled"
+                )
+            if self.graph_batch_sizes_init:
+                raise RuntimeError(
+                    "graph_batch_sizes_init is valid only when Torchair graph mode is enabled"
+                )
+            if self.enable_multistream_mla:
+                raise RuntimeError(
+                    "enable_multistream_mla is valid only when Torchair graph mode is enabled"
+                )
+            if self.enable_multistream_moe:
+                raise RuntimeError(
+                    "enable_multistream_moe is valid only when Torchair graph mode is enabled"
+                )
+            if self.enable_kv_nz:
+                raise RuntimeError(
+                    "enable_kv_nz is valid only when Torchair graph mode is enabled"
+                )


 class AscendSchedulerConfig:
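
The added block is correct but repetitive; the rule it enforces is simply "no graph-only option may be truthy while graph mode itself is off". A compact, functionally equivalent sketch of that rule (option names are taken from the diff; the loop form is only an illustration, not the merged code):

# Illustrative refactor of the guard above, not the merged implementation.
_GRAPH_ONLY_OPTIONS = (
    "use_cached_graph",
    "graph_batch_sizes",
    "graph_batch_sizes_init",
    "enable_multistream_mla",
    "enable_multistream_moe",
    "enable_kv_nz",
)

def _check_graph_only_options(self) -> None:
    if self.enabled:
        return
    for name in _GRAPH_ONLY_OPTIONS:
        # Truthy check covers both boolean flags and the graph_batch_sizes list.
        if getattr(self, name):
            raise RuntimeError(
                f"{name} is valid only when Torchair graph mode is enabled")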

vllm_ascend/attention/mla_v1.py

Lines changed: 7 additions & 9 deletions
@@ -352,13 +352,13 @@ def build_torchair_graph_dummy(
         else:
             attn_state = AscendAttentionState.DecodeOnly
             num_decode_tokens = 1
-        sin = torch.ones(num_reqs,
+        sin = torch.ones(num_tokens,
                          1,
                          1,
                          self.rope_dim,
                          dtype=self.runner.dtype,
                          device=device)
-        cos = torch.ones(num_reqs,
+        cos = torch.ones(num_tokens,
                          1,
                          1,
                          self.rope_dim,
@@ -547,15 +547,13 @@ def build(
                 actual_seq_q_lens = query_start_loc[1:].tolist(
                 ) + self.runner.actual_seq_q_lens[num_reqs:num_reqs +
                                                   num_reqs_pad_size]
-                cos = self.cos_cache[
-                    input_positions].unsqueeze(  # type: ignore
-                        1).unsqueeze(2)
-                sin = self.sin_cache[
-                    input_positions].unsqueeze(  # type: ignore
-                        1).unsqueeze(2)
             else:
                 seq_lens_list = seq_lens.tolist()
-                cos, sin = None, None
+
+            cos = self.cos_cache[input_positions].unsqueeze(  # type: ignore
+                1).unsqueeze(2)
+            sin = self.sin_cache[input_positions].unsqueeze(  # type: ignore
+                1).unsqueeze(2)
             mc2_mask = self.generate_activate_mask(
                 num_actual_tokens, num_reqs + num_reqs_pad_size)
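
Both hunks follow from the same shape invariant: the rotary cos/sin tensors are per token rather than per request, and indexing the cached table with the flat position ids followed by two unsqueezes produces a [num_tokens, 1, 1, rope_dim] tensor. A small standalone illustration of that indexing pattern (sizes and names here are made up for the example):

import torch

rope_dim, max_pos, num_tokens = 64, 4096, 10          # illustrative sizes
cos_cache = torch.randn(max_pos, rope_dim)             # stand-in for self.cos_cache
input_positions = torch.randint(0, max_pos, (num_tokens,))

# Same gather + unsqueeze pattern as the diff: one row per token,
# then two broadcast dimensions are added.
cos = cos_cache[input_positions].unsqueeze(1).unsqueeze(2)
assert cos.shape == (num_tokens, 1, 1, rope_dim)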

vllm_ascend/models/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ def register_model():
         AscendQwen2_5_VLForConditionalGeneration  # noqa: F401
     from .qwen2_vl import AscendQwen2VLForConditionalGeneration  # noqa: F401
     from .moe_block import AscendSparseMoeBlock  # noqa: F401
+    from .qwen3 import CustomQwen3ForCausalLM  # noqa: F401

     ModelRegistry.register_model(
         "DeepSeekMTPModel",

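The import only makes CustomQwen3ForCausalLM visible inside register_model(); presumably it is then registered with ModelRegistry like the other entries in this file. The hunk does not show that call, so the snippet below is an assumption that follows the visible DeepSeekMTPModel pattern:

# Assumed registration call (not shown in this hunk): map the HF architecture
# name to the Ascend-specific implementation via a lazy "module:Class" string.
ModelRegistry.register_model(
    "Qwen3ForCausalLM",
    "vllm_ascend.models.qwen3:CustomQwen3ForCausalLM")
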
vllm_ascend/models/deepseek_v2.py

Lines changed: 4 additions & 2 deletions
@@ -236,7 +236,8 @@ def __init__(
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
         self.enable_multistream_moe = \
-            ascend_config.torchair_graph_config.enable_multistream_moe
+            ascend_config.torchair_graph_config.enable_multistream_moe and \
+            self.torchair_graph_enabled

         self.gate = ReplicatedLinear(config.hidden_size,
                                      config.n_routed_experts,
@@ -462,7 +463,8 @@ def __init__(
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
         self.enable_multistream_mla = \
-            ascend_config.torchair_graph_config.enable_multistream_mla
+            ascend_config.torchair_graph_config.enable_multistream_mla and \
+            self.torchair_graph_enabled

     def forward(
         self,
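
Both hunks (and the matching change in vllm_ascend/ops/fused_moe.py further down) apply the same defensive pattern: the multistream flag is AND-ed with the graph-mode flag at construction time, so the feature stays off outside torchair graph mode even if a stray config slipped past validation. A minimal sketch of the pattern in isolation (class and argument names are illustrative):

# Illustrative pattern only: derive an "effective" flag that cannot be on
# unless its prerequisite mode is on.
class _Example:
    def __init__(self, torchair_graph_enabled: bool,
                 enable_multistream_moe: bool) -> None:
        self.torchair_graph_enabled = torchair_graph_enabled
        self.enable_multistream_moe = (enable_multistream_moe
                                       and self.torchair_graph_enabled)

assert _Example(False, True).enable_multistream_moe is False
assert _Example(True, True).enable_multistream_moe is True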

vllm_ascend/models/qwen3.py

Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@ (new file; all 156 lines added)

from collections.abc import Iterable
from typing import Optional, Union

import torch
from torch import nn
from transformers import Qwen3Config
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
from vllm.model_executor.models.qwen2 import Qwen2Model
from vllm.model_executor.models.qwen3 import Qwen3DecoderLayer
from vllm.model_executor.models.utils import (AutoWeightsLoader,
                                              PPMissingLayer, maybe_prefix)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors

from vllm_ascend.ops.layernorm import AddRMSNormQuant


class CustomQwen3DecoderLayer(Qwen3DecoderLayer):

    def __init__(
        self,
        config: Qwen3Config,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__(config=config,
                         cache_config=cache_config,
                         quant_config=quant_config,
                         prefix=prefix)
        if quant_config is None:
            return

        from vllm_ascend.quantization.quant_config import AscendQuantConfig
        from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod

        assert isinstance(quant_config, AscendQuantConfig), \
            "Expected quant_config to be an instance of AscendQuantConfig"

        if isinstance(self.self_attn.qkv_proj.quant_method,
                      AscendW8A8LinearMethod):
            self.input_layernorm = AddRMSNormQuant(
                config.hidden_size,
                layer=self.self_attn.qkv_proj,
                eps=config.rms_norm_eps)
        if isinstance(self.mlp.gate_up_proj.quant_method,
                      AscendW8A8LinearMethod):
            self.post_attention_layernorm = AddRMSNormQuant(
                config.hidden_size,
                layer=self.mlp.gate_up_proj,
                eps=config.rms_norm_eps)


ALL_DECODER_LAYER_TYPES = {
    "attention": CustomQwen3DecoderLayer,
}


@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl,
        # otherwise (seq_len, ).
        "positions": -1,
        "intermediate_tensors": 0,
        "inputs_embeds": 0,
    })
class CustomQwen3Model(Qwen2Model):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config,
                         prefix=prefix,
                         decoder_layer_type=CustomQwen3DecoderLayer)


class CustomQwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
    # add `CustomQwen3Model` to init self.model
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config

        self.config = config
        self.lora_config = lora_config

        self.quant_config = quant_config
        self.model = CustomQwen3Model(vllm_config=vllm_config,
                                      prefix=maybe_prefix(prefix, "model"))

        if get_pp_group().is_last_rank:
            if config.tie_word_embeddings:
                self.lm_head = self.model.embed_tokens
            else:
                self.lm_head = ParallelLMHead(config.vocab_size,
                                              config.hidden_size,
                                              quant_config=quant_config,
                                              prefix=maybe_prefix(
                                                  prefix, "lm_head"))
        else:
            self.lm_head = PPMissingLayer()

        self.logits_processor = LogitsProcessor(config.vocab_size)

        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.get_input_embeddings(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                   inputs_embeds)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(["lm_head."]
                           if self.config.tie_word_embeddings else None),
        )
        return loader.load_weights(weights)
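
The Ascend-specific twist in this file is the norm swap: when the qkv_proj or gate_up_proj layers use the W8A8 linear method, the preceding RMSNorm is replaced by AddRMSNormQuant so that the residual add, normalization, and activation quantization can run as one fused step feeding int8 activations into the following linear. A rough eager-mode sketch of what such a fused step computes; this is an assumption about the semantics for illustration only, and the real AddRMSNormQuant signature and Ascend kernel may differ:

import torch

def add_rmsnorm_quant_reference(x: torch.Tensor,
                                residual: torch.Tensor,
                                weight: torch.Tensor,
                                eps: float,
                                scale: torch.Tensor):
    """Eager-mode sketch of fused add + RMSNorm + per-tensor int8 quantization."""
    hidden = x + residual                                    # residual add
    variance = hidden.pow(2).mean(-1, keepdim=True)
    normed = hidden * torch.rsqrt(variance + eps) * weight   # RMSNorm
    quantized = torch.clamp(torch.round(normed / scale),
                            -128, 127).to(torch.int8)        # int8 activations for W8A8
    return quantized, hidden                                 # new residual stays in float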

vllm_ascend/ops/fused_moe.py

Lines changed: 2 additions & 1 deletion
@@ -1110,7 +1110,8 @@ def __init__(

         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
         self.enable_multistream_moe = \
-            ascend_config.torchair_graph_config.enable_multistream_moe
+            ascend_config.torchair_graph_config.enable_multistream_moe and \
+            self.torchair_graph_enabled

         if self.scoring_func != "softmax" and not self.use_grouped_topk:
             raise ValueError("Only softmax scoring function is supported for "
