Add moe_block: replace Qwen3MoeSparseMoeBlock with AscendSparseMoeBlock

weijinqian_v1 · weijinqian_v1 · commit a9bccf85594c · 2025-07-09T16:25:22.000+08:00
Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
diff --git a/vllm_ascend/models/moe_block.py b/vllm_ascend/models/moe_block.py
@@ -18,6 +18,8 @@
 from typing import Optional
 
 import torch
+import vllm.model_executor.models.qwen3_moe as qwen3
+
 from torch import nn
 from vllm.attention import AttentionMetadata
 from vllm.distributed import (get_tensor_model_parallel_world_size,
@@ -91,13 +93,12 @@ def forward(
             attn_metadata = get_forward_context().attn_metadata
         # when profile runs, force experts to load balanced tokens
         # to avoid high memory consumption on a single rank.
-        is_prefill = True
         if attn_metadata is None:
             # for profile run
             is_prefill = True
             enable_force_load_balance = True
         else:
-            # is_prefill = attn_metadata.num_prefills > 0 is_prefill or
+            is_prefill = False
             enable_force_load_balance = False
             if hasattr(attn_metadata, 'with_prefill_across_dp'):
                 is_prefill = attn_metadata.with_prefill_across_dp
@@ -114,4 +115,6 @@ def forward(
             shared_experts=None,
         )
 
-        return hidden_states
+        return hidden_states
+
+qwen3.Qwen3MoeSparseMoeBlock = AscendSparseMoeBlock