
Commit c73953c

Author: weijinqian
Commit message: handle clean code

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>

1 parent 0719b71 · commit c73953c

File tree

6 files changed (+37, -47 lines)


vllm_ascend/ascend_forward_context.py

Lines changed: 22 additions & 35 deletions
@@ -28,11 +28,8 @@ def get_fused_moe_state(ep_size: int, with_prefill: bool):
         return FusedMoEState.AllGather
     elif envs_ascend.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ:
         # MC2 Dispatch/Combine performs better than alltoall_seq in decoding stage.
-        return (
-            FusedMoEState.All2AllSeq
-            if (ep_size < 16 or with_prefill)
-            else FusedMoEState.MC2
-        )
+        return (FusedMoEState.All2AllSeq if
+                (ep_size < 16 or with_prefill) else FusedMoEState.MC2)
     elif ep_size >= 16 and with_prefill and enable_chunk_mc2:
         return FusedMoEState.MC2_PREFILL
     # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph.
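Note: the reformatting in this hunk is behaviour-preserving. For readers skimming the diff, the selection logic reads roughly as the sketch below. This is only an illustration, not the module's code: the condition guarding the AllGather branch sits above this hunk, so it is stood in for by a hypothetical use_all_gather flag, and enable_chunk_mc2 is passed explicitly instead of being resolved elsewhere.

from enum import Enum


class FusedMoEState(Enum):
    AllGather = 0
    All2AllSeq = 1
    MC2 = 2
    MC2_PREFILL = 3


def sketch_get_fused_moe_state(ep_size: int, with_prefill: bool,
                               use_all_gather: bool, all2all_seq: bool,
                               enable_chunk_mc2: bool) -> FusedMoEState:
    if use_all_gather:  # hypothetical stand-in for the branch above this hunk
        return FusedMoEState.AllGather
    elif all2all_seq:  # VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ
        # MC2 dispatch/combine beats alltoall_seq in decode, so small EP
        # groups or prefill keep All2AllSeq, otherwise fall through to MC2.
        return (FusedMoEState.All2AllSeq if
                (ep_size < 16 or with_prefill) else FusedMoEState.MC2)
    elif ep_size >= 16 and with_prefill and enable_chunk_mc2:
        return FusedMoEState.MC2_PREFILL
    # mc2 needs ep_size >= 16 and all2all can't be used in the torchair graph.
    ...  # remaining branches are outside the hunk shown above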
@@ -58,19 +55,16 @@ def set_ascend_forward_context(
     We add some additional param into forward_context.
     """
     with set_forward_context(
-        attn_metadata,
-        vllm_config,
-        virtual_engine=virtual_engine,
-        num_tokens=num_tokens,
-        num_tokens_across_dp=num_tokens_across_dp,
+            attn_metadata,
+            vllm_config,
+            virtual_engine=virtual_engine,
+            num_tokens=num_tokens,
+            num_tokens_across_dp=num_tokens_across_dp,
     ):
         forward_context = get_forward_context()
         forward_context.with_prefill = with_prefill
-        ep_size = (
-            torch.distributed.get_world_size()
-            if vllm_config.parallel_config.enable_expert_parallel
-            else 1
-        )
+        ep_size = (torch.distributed.get_world_size() if
+                   vllm_config.parallel_config.enable_expert_parallel else 1)
 
         fused_moe_state = get_fused_moe_state(ep_size, with_prefill)
 
@@ -88,18 +82,16 @@ def set_ascend_forward_context(
             num_tokens = attn_metadata.num_actual_tokens
         else:
             # for v0 engine
-            num_tokens = (
-                attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
-            )
+            num_tokens = (attn_metadata.num_prefill_tokens +
+                          attn_metadata.num_decode_tokens)
 
         if num_actual_tokens is None:
             num_actual_tokens = num_tokens
 
         dp_world_size = get_dp_group().world_size
         if dp_world_size > 1 and forward_context.dp_metadata is not None:
             max_tokens_across_dp = (
-                forward_context.dp_metadata.max_tokens_across_dp_cpu.item()
-            )
+                forward_context.dp_metadata.max_tokens_across_dp_cpu.item())
         else:
             max_tokens_across_dp = num_tokens
 
@@ -110,31 +102,26 @@ def set_ascend_forward_context(
         world_size = torch.distributed.get_world_size()
         # NOTE: token num which need to pad to when mc2
         forward_context.padded_num_tokens = (
-            math.ceil(max_tokens_across_dp / tp_world_size) * tp_world_size
-        )
+            math.ceil(max_tokens_across_dp / tp_world_size) *
+            tp_world_size)
         # NOTE: mc2 op's param `global_bs`, add `world_size` to make `global_bs` absolutely larger than actual global_bs.
         forward_context.global_bs = (
-            math.ceil(max_tokens_across_dp / tp_world_size) * world_size
-        )
+            math.ceil(max_tokens_across_dp / tp_world_size) * world_size)
 
         if fused_moe_state == FusedMoEState.MC2_PREFILL:
             chunk_size = envs.VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE
             forward_context.max_num_chunks = math.ceil(
-                math.ceil(max_tokens_across_dp / tp_world_size) / chunk_size
-            )
+                math.ceil(max_tokens_across_dp / tp_world_size) /
+                chunk_size)
 
-            forward_context.global_bs = (
-                math.ceil(
-                    math.ceil(max_tokens_across_dp / tp_world_size)
-                    / forward_context.max_num_chunks
-                )
-                * world_size
-            )
+            forward_context.global_bs = (math.ceil(
+                math.ceil(max_tokens_across_dp / tp_world_size) /
+                forward_context.max_num_chunks) * world_size)
 
             min_num_tokens = forward_context.max_num_chunks * tp_world_size
             forward_context.padded_num_tokens = (
-                math.ceil(max_tokens_across_dp / min_num_tokens) * min_num_tokens
-            )
+                math.ceil(max_tokens_across_dp / min_num_tokens) *
+                min_num_tokens)
 
         mc2_mask = torch.zeros(
             forward_context.padded_num_tokens,
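
Note: the padding arithmetic in the hunk above is easier to verify with concrete numbers. The values below (max_tokens_across_dp, tp_world_size, world_size, chunk_size) are made-up examples, not defaults:

import math

max_tokens_across_dp = 100  # assumed example value
tp_world_size = 8           # assumed example value
world_size = 32             # assumed example value
chunk_size = 4              # assumed VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE value

per_tp = math.ceil(max_tokens_across_dp / tp_world_size)   # 13
padded_num_tokens = per_tp * tp_world_size                 # 104
global_bs = per_tp * world_size                            # 416, deliberately >= the actual global batch

# MC2_PREFILL path: the per-TP token budget is further split into chunks.
max_num_chunks = math.ceil(per_tp / chunk_size)                       # 4
global_bs_chunked = math.ceil(per_tp / max_num_chunks) * world_size   # 128
min_num_tokens = max_num_chunks * tp_world_size                       # 32
padded_chunked = math.ceil(max_tokens_across_dp / min_num_tokens) * min_num_tokens  # 128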

vllm_ascend/models/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -58,6 +58,6 @@ def register_model():
     ModelRegistry.register_model(
         "Qwen3MoeForCausalLM",
         "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM")
-
+
     ModelRegistry.register_model(
         "Qwen3ForCausalLM", "vllm_ascend.models.qwen3:CustomQwen3ForCausalLM")

vllm_ascend/models/deepseek_dbo.py

Lines changed: 8 additions & 4 deletions
@@ -147,7 +147,8 @@ def __init__(
             intermediate_size=intermediate_size,
             hidden_act=config.hidden_act,
             quant_config=quant_config,
-            reduce_results=not envs_ascend.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ,  # shared experts tp comm is separated in alltoallv for better overlap.
+            reduce_results=not envs_ascend.
+            VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ,  # shared experts tp comm is separated in alltoallv for better overlap.
             prefix=f"{prefix}.shared_experts",
         )
         CustomDeepseekDBOMoE.top_k = config.num_experts_per_tok
@@ -232,7 +233,9 @@ def _forward_op_gating(
             chunk_hidden_states = torch.tensor_split(hidden_states,
                                                      self.tp_size,
                                                      dim=0)
-            chunked_hidden_states_sizes = [x.shape[0] for x in chunk_hidden_states]
+            chunked_hidden_states_sizes = [
+                x.shape[0] for x in chunk_hidden_states
+            ]
             local_hidden_states = chunk_hidden_states[self.tp_rank]
         else:
             local_hidden_states = hidden_states
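
Note: a minimal sketch of what the tensor_split chunking above produces; shapes and tp values are made-up examples:

import torch

tp_size, tp_rank = 4, 1              # assumed example values
hidden_states = torch.randn(10, 16)  # 10 tokens, hidden size 16 (illustrative)

chunk_hidden_states = torch.tensor_split(hidden_states, tp_size, dim=0)
chunked_hidden_states_sizes = [x.shape[0] for x in chunk_hidden_states]
local_hidden_states = chunk_hidden_states[tp_rank]

print(chunked_hidden_states_sizes)  # [3, 3, 2, 2] -- uneven splits are allowed
print(local_hidden_states.shape)    # torch.Size([3, 16])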
@@ -245,7 +248,7 @@ def _forward_op_gating(
         if self.config.n_routed_experts == 256:
             topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
                 router_logits,
-                k=self.config.num_experts_per_tok,
+                k=self.config.num_experts_per_tok,
                 bias=self.gate.e_score_correction_bias,
                 k_group=self.config.topk_group,  # fix: 4
                 group_count=self.config.n_group,  # fix 8
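
Note: for context, group-limited top-k gating of the flavour used above can be sketched in plain PyTorch as below. This is not the torch_npu.npu_moe_gating_top_k kernel (its scaling and normalisation options are omitted); it only illustrates the roles of k, bias, k_group and group_count.

import torch


def grouped_topk_sketch(router_logits: torch.Tensor,
                        e_score_correction_bias: torch.Tensor,
                        k: int, k_group: int, group_count: int):
    """Rough illustration of group-limited top-k routing (not the NPU kernel)."""
    scores = torch.sigmoid(router_logits)          # (num_tokens, n_experts)
    biased = scores + e_score_correction_bias      # bias only steers selection
    num_tokens, n_experts = biased.shape
    grouped = biased.view(num_tokens, group_count, n_experts // group_count)

    # Rank expert groups by the sum of their two best biased scores,
    # keep the best k_group groups per token.
    group_scores = grouped.topk(2, dim=-1).values.sum(-1)
    top_groups = group_scores.topk(k_group, dim=-1).indices

    # Mask experts outside the selected groups, then take the final top-k.
    group_mask = torch.zeros_like(group_scores).scatter_(1, top_groups, 1.0)
    expert_mask = group_mask.unsqueeze(-1).expand_as(grouped).reshape(
        num_tokens, n_experts)
    masked = biased.masked_fill(expert_mask == 0, float("-inf"))
    topk_ids = masked.topk(k, dim=-1).indices
    topk_weights = scores.gather(1, topk_ids)      # weights from unbiased scores
    return topk_weights, topk_ids


# Example with made-up sizes: 4 tokens, 256 experts in 8 groups, route to 8.
w, ids = grouped_topk_sketch(torch.randn(4, 256), torch.zeros(256),
                             k=8, k_group=4, group_count=8)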
@@ -273,7 +276,8 @@ def _forward_op_gating(
         # to avoid accumulating too much tokens on a single rank.
         # currently it is only activated when doing profile runs.
         if enable_force_load_balance:
-            topk_ids = torch.randint_like(topk_ids, 0, self.config.n_routed_experts)
+            topk_ids = torch.randint_like(topk_ids, 0,
+                                          self.config.n_routed_experts)
 
         return topk_weights, topk_ids, local_hidden_states, chunked_hidden_states_sizes
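
Note: the force-load-balance branch above simply randomises the routed expert ids so that a profile run spreads tokens roughly evenly over all experts. A tiny illustration with made-up sizes:

import torch

n_routed_experts = 8                                # assumed example value
topk_ids = torch.tensor([[0, 1], [0, 1], [0, 1]])   # heavily skewed routing

# Same shape and dtype, but uniformly random expert ids, so no single
# expert (and hence no single rank) accumulates most of the tokens.
topk_ids = torch.randint_like(topk_ids, 0, n_routed_experts)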

vllm_ascend/models/qwen3_moe.py

Lines changed: 2 additions & 1 deletion
@@ -33,6 +33,7 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
             "gate_proj",
             "up_proj",
         ],
-        "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
     }
     qwen3.Qwen3MoeSparseMoeBlock = AscendSparseMoeBlock

vllm_ascend/multistream/ms_split.py

Lines changed: 2 additions & 2 deletions
@@ -294,8 +294,8 @@ def model_input_split_v1_attn(
                                           token_index)
 
     is_only_prefill_pre = is_only_prefill_post = attn_metadata.is_only_prefill
-    has_prefill_pre, _ = torch.any(
-        query_lens_pre > 1).item(), torch.any(query_lens_post > 1).item()
+    has_prefill_pre, _ = torch.any(query_lens_pre > 1).item(), torch.any(
+        query_lens_post > 1).item()
 
     if not attn_metadata.is_only_prefill:
         is_only_prefill_post = torch.all(query_lens_post > 1).item()
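
Note: the split metadata above derives its prefill flags from the per-request query lengths: a request with query_len > 1 is still prefilling, while a decode request contributes exactly one token. A small example with made-up lengths:

import torch

query_lens_pre = torch.tensor([5, 1, 1])   # first micro-batch: one prefill, two decodes
query_lens_post = torch.tensor([7, 3])     # second micro-batch: prefill only

has_prefill_pre = torch.any(query_lens_pre > 1).item()        # True
is_only_prefill_post = torch.all(query_lens_post > 1).item()  # True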

vllm_ascend/ops/moe_dispatcher/token_dispatcher.py

Lines changed: 2 additions & 4 deletions
@@ -232,7 +232,6 @@ def preprocess(self,
 
         ep_size = self.ep_size
 
-
         # Dropless
         self.num_out_tokens = indices.numel()
         if self.ep_size > 1 or self.num_local_experts > 1:
@@ -408,7 +407,6 @@ def preprocess_and_permtute1(self,
             shared_output = shared_experts(shared_experts_input)
             self.cached_shared_expert_output = shared_output
 
-
         hidden_states, self.reversed_local_input_permutation_mapping = torch_npu.npu_moe_token_permute(
             tokens=hidden_states,
             indices=self.top_indices,
@@ -542,8 +540,8 @@ def alltoall_token_unpermutation2(permutated_local_input_tokens):
 
             output = torch_npu.npu_moe_token_unpermute(
                 permuted_tokens=permutated_local_input_tokens,
-                sorted_indices=self.
-                reversed_local_input_permutation_mapping.to(torch.int32),
+                sorted_indices=self.reversed_local_input_permutation_mapping.
+                to(torch.int32),
                 probs=self.probs,
                 restore_shape=self.hidden_shape_before_permute)
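
Note: the change above only moves where the int32 cast is written; the permute/unpermute pair itself restores the original token order after expert computation. A rough pure-PyTorch sketch of that round trip (not the torch_npu kernels, and without the probability-weighted merge they support):

import torch

tokens = torch.randn(4, 8)            # 4 tokens, hidden size 8 (illustrative)
indices = torch.tensor([2, 0, 1, 0])  # top-1 expert id per token

# "Permute": group tokens by expert id, remembering how to undo it.
sort_order = torch.argsort(indices)
permuted_tokens = tokens[sort_order]
reversed_mapping = torch.argsort(sort_order)  # analogous to the cached mapping above

# "Unpermute": restore the original order. The kernel takes int32 indices;
# plain tensor indexing wants int64, hence the extra .long() here.
restored = permuted_tokens[reversed_mapping.to(torch.int32).long()]
assert torch.equal(restored, tokens)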
