From 37f1170dfb26ffedab28f320c4cd83473a863eb4 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Tue, 8 Jul 2025 17:09:40 +0800
Subject: [PATCH 1/3] upgrade ci npu to 64g per chip

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 .github/workflows/accuracy_test.yaml              |  4 ++--
 .github/workflows/vllm_ascend_doctest.yaml        |  2 +-
 .github/workflows/vllm_ascend_test.yaml           |  6 +++---
 .github/workflows/vllm_ascend_test_long_term.yaml |  2 +-
 benchmarks/scripts/run_accuracy.py                | 12 ++++++------
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml
index 6086bbb457..29f5a7f53f 100644
--- a/.github/workflows/accuracy_test.yaml
+++ b/.github/workflows/accuracy_test.yaml
@@ -85,8 +85,8 @@ jobs:
       }}
     runs-on: >-
       ${{
-          (matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-arm64-npu-4') ||
-          'linux-arm64-npu-2'
+          (matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-arrch64-a2-2') ||
+          'linux-arrch64-a2-1'
       }}
     strategy:
       matrix:
diff --git a/.github/workflows/vllm_ascend_doctest.yaml b/.github/workflows/vllm_ascend_doctest.yaml
index f26df372cd..462e67b1ff 100644
--- a/.github/workflows/vllm_ascend_doctest.yaml
+++ b/.github/workflows/vllm_ascend_doctest.yaml
@@ -48,7 +48,7 @@ jobs:
       matrix:
         vllm_verison: [v0.9.1-dev, v0.9.1-dev-openeuler, main, main-openeuler]
     name: vLLM Ascend test
-    runs-on: linux-arm64-npu-1
+    runs-on: linux-arrch64-a2-1
     container:
       image: m.daocloud.io/quay.io/ascend/vllm-ascend:${{ matrix.vllm_verison }}
     steps:
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index 236b10f13c..9bb809296d 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -200,7 +200,7 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        os: [linux-arm64-npu-1]
+        os: [linux-arrch64-a2-1]
         vllm_version: [main, v0.9.2]
     name: singlecard e2e test
     runs-on: ${{ matrix.os }}
@@ -299,9 +299,9 @@ jobs:
     needs: [e2e]
     if: ${{ needs.e2e.result == 'success' }}
     strategy:
-      max-parallel: 1
+      max-parallel: 2
       matrix:
-        os: [linux-arm64-npu-4]
+        os: [linux-arrch64-a2-4]
         vllm_version: [main, v0.9.2]
     name: multicard e2e test
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml
index 9a33b3aca8..c328d6e131 100644
--- a/.github/workflows/vllm_ascend_test_long_term.yaml
+++ b/.github/workflows/vllm_ascend_test_long_term.yaml
@@ -42,7 +42,7 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        os: [linux-arm64-npu-1, linux-arm64-npu-4]
+        os: [linux-arrch64-a2-1, linux-arm64-npu-2]
         vllm_version: [main, v0.9.2]
     name: vLLM Ascend long term test
     runs-on: ${{ matrix.os }}
diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py
index 2922c52c6c..1d60cb479c 100644
--- a/benchmarks/scripts/run_accuracy.py
+++ b/benchmarks/scripts/run_accuracy.py
@@ -50,17 +50,17 @@
 # Command templates for running evaluations
 MODEL_RUN_INFO = {
     "Qwen/Qwen3-30B-A3B":
-    ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
+    ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
      "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
      "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
      ),
     "Qwen/Qwen3-8B-Base":
-    ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
+    ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6'\n"
      "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
      "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
      ),
     "Qwen/Qwen2.5-VL-7B-Instruct":
-    ("export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n"
+    ("export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2'\n"
      "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
      "--apply_chat_template --fewshot_as_multiturn  --batch_size 1"),
 }
@@ -102,11 +102,11 @@
 # Model arguments for evaluation
 MODEL_ARGS = {
     "Qwen/Qwen3-8B-Base":
-    "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6",
+    "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6",
     "Qwen/Qwen2.5-VL-7B-Instruct":
-    "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2",
+    "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2",
     "Qwen/Qwen3-30B-A3B":
-    "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True"
+    "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True"
 }
 
 # Whether to apply chat template formatting

From 50c51566adec79cb5aee01ef7370431d3b75c474 Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Tue, 8 Jul 2025 17:12:16 +0800
Subject: [PATCH 2/3] ci: run multicard e2e test on linux-arrch64-a2-2 instead of a2-4

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 .github/workflows/vllm_ascend_test.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index 9bb809296d..3933cc2cd5 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -301,7 +301,7 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        os: [linux-arrch64-a2-4]
+        os: [linux-arrch64-a2-2]
         vllm_version: [main, v0.9.2]
     name: multicard e2e test
     runs-on: ${{ matrix.os }}

From 5fd5d75b76ebddc945affa2e1964ef8c682bb73a Mon Sep 17 00:00:00 2001
From: wangli <wangli858794774@gmail.com>
Date: Tue, 8 Jul 2025 17:15:06 +0800
Subject: [PATCH 3/3] ci: replace leftover linux-arm64-npu-2 label in long term test matrix

Signed-off-by: wangli <wangli858794774@gmail.com>
---
 .github/workflows/vllm_ascend_test_long_term.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml
index c328d6e131..788a98a371 100644
--- a/.github/workflows/vllm_ascend_test_long_term.yaml
+++ b/.github/workflows/vllm_ascend_test_long_term.yaml
@@ -42,7 +42,7 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        os: [linux-arrch64-a2-1, linux-arm64-npu-2]
+        os: [linux-arrch64-a2-1, linux-arrch64-a2-2]
         vllm_version: [main, v0.9.2]
     name: vLLM Ascend long term test
     runs-on: ${{ matrix.os }}