Commit b01cc94

use additional_config to enable cv parallel
Signed-off-by: David9857 <985700846@qq.com>
1 parent d118d63 commit b01cc94
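
For reference, a minimal usage sketch (not part of this commit): after this change, CV parallel is switched on through the engine's additional_config dict rather than the removed VLLM_ENABLE_CV_PARALLEL environment variable. The model name below is a placeholder, and the sketch assumes vLLM's LLM entry point accepts additional_config and forwards it into VllmConfig.additional_config.

    from vllm import LLM

    # Placeholder model; any checkpoint served through vllm-ascend would do here.
    llm = LLM(
        model="deepseek-ai/DeepSeek-V2-Lite",
        # Read in CustomDeepseekV2MoE.__init__ and AscendFusedMoE.__init__ via
        # additional_config.get("enable_cv_parallel", False); see the diffs below.
        additional_config={"enable_cv_parallel": True},
    )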

File tree: 3 files changed, +14 -7 lines changed

vllm_ascend/envs.py

Lines changed: 0 additions & 2 deletions
@@ -36,8 +36,6 @@
     lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
     "VLLM_ENABLE_MC2":
     lambda: bool(int(os.getenv("VLLM_ENABLE_MC2", '0'))),
-    "VLLM_ENABLE_CV_PARALLEL":
-    lambda: bool(int(os.getenv("VLLM_ENABLE_CV_PARALLEL", '0'))),
     "USING_LCCL_COM":
     lambda: bool(int(os.getenv("USING_LCCL_COM", '0'))),
     "SOC_VERSION":

vllm_ascend/models/deepseek_v2.py

Lines changed: 7 additions & 2 deletions
@@ -71,7 +71,6 @@
 from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod

 VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
-VLLM_ENABLE_CV_PARALLEL: bool = envs_ascend.VLLM_ENABLE_CV_PARALLEL


 class CustomDeepseekV2MLP(nn.Module):
@@ -179,6 +178,12 @@ def __init__(
         else:
             self.gate.e_score_correction_bias = None

+        self.enable_cv_parallel = False
+        additional_config = get_current_vllm_config().additional_config
+        if additional_config:
+            self.enable_cv_parallel = additional_config.get(
+                "enable_cv_parallel", False)
+
         self.experts = AscendFusedMoE(
             num_experts=config.n_routed_experts,
             top_k=config.num_experts_per_tok,
@@ -226,7 +231,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             enable_force_load_balance = False
         num_tokens, hidden_dim = hidden_states.shape

-        cv_parallel = VLLM_ENABLE_CV_PARALLEL and not is_prefill
+        cv_parallel = self.enable_cv_parallel and not is_prefill

         if self.n_shared_experts is not None:
             if not cv_parallel:
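
The DeepSeek block above reads the flag through vLLM's current-config helper. For context, a self-contained sketch of that lookup; the import comes from vLLM itself and is not shown in this diff:

    from vllm.config import get_current_vllm_config

    # Any module constructed inside vLLM's model-loading context can read the
    # same dict the hunks above consult; None means no additional_config was set.
    additional_config = get_current_vllm_config().additional_config
    enable_cv_parallel = bool(
        additional_config.get("enable_cv_parallel", False)) if additional_config else False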

vllm_ascend/ops/fused_moe.py

Lines changed: 7 additions & 3 deletions
@@ -36,7 +36,6 @@

 VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
 USING_LCCL_COM: bool = envs_ascend.USING_LCCL_COM
-VLLM_ENABLE_CV_PARALLEL: bool = envs_ascend.VLLM_ENABLE_CV_PARALLEL


 def fused_experts_with_mc2(
@@ -811,6 +810,11 @@ def __init__(

         self.quant_method.create_weights(layer=self, **moe_quant_params)

+        self.enable_cv_parallel = False
+        if vllm_config.additional_config:
+            self.enable_cv_parallel = vllm_config.additional_config.get(
+                "enable_cv_parallel", False)
+
     def forward(self,
                 hidden_states: torch.Tensor,
                 router_logits: torch.Tensor,
@@ -847,7 +851,7 @@ def forward(self,
             enable_force_load_balance=enable_force_load_balance,
             **kwargs)

-        if VLLM_ENABLE_CV_PARALLEL and not is_prefill:
+        if self.enable_cv_parallel and not is_prefill:
             final_hidden_states, shared_output = final_hidden_states

         if VLLM_ENABLE_MC2 and not is_prefill:
@@ -857,6 +861,6 @@ def forward(self,
             final_hidden_states = tensor_model_parallel_all_reduce(
                 final_hidden_states)

-        if VLLM_ENABLE_CV_PARALLEL and not is_prefill:
+        if self.enable_cv_parallel and not is_prefill:
             return final_hidden_states, shared_output
         return final_hidden_states
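
One consequence of the last two hunks, shown as a toy sketch (not repo code): when enable_cv_parallel is set and the step is decode, AscendFusedMoE.forward returns a (routed, shared) pair instead of a single tensor, so callers such as CustomDeepseekV2MoE.forward have to unpack it under the same condition. The function and variable names below are made up for illustration.

    import torch

    def fake_moe_forward(x: torch.Tensor, enable_cv_parallel: bool, is_prefill: bool):
        # Stand-ins for the routed-expert and shared-expert outputs.
        routed, shared = x * 2.0, x * 0.5
        if enable_cv_parallel and not is_prefill:
            return routed, shared
        return routed

    out = fake_moe_forward(torch.ones(4), enable_cv_parallel=True, is_prefill=False)
    routed, shared = out  # the CV-parallel decode path must be unpacked by the caller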
