diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml
index 6086bbb457..29f5a7f53f 100644
--- a/.github/workflows/accuracy_test.yaml
+++ b/.github/workflows/accuracy_test.yaml
@@ -85,8 +85,8 @@ jobs:
       }}
     runs-on: >-
       ${{
-        (matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-arm64-npu-4') ||
-        'linux-arm64-npu-2'
+        (matrix.model_name == 'Qwen/Qwen3-30B-A3B' && 'linux-aarch64-a2-2') ||
+        'linux-aarch64-a2-1'
       }}
     strategy:
       matrix:
diff --git a/.github/workflows/vllm_ascend_doctest.yaml b/.github/workflows/vllm_ascend_doctest.yaml
index f26df372cd..462e67b1ff 100644
--- a/.github/workflows/vllm_ascend_doctest.yaml
+++ b/.github/workflows/vllm_ascend_doctest.yaml
@@ -48,7 +48,7 @@ jobs:
       matrix:
         vllm_verison: [v0.9.1-dev, v0.9.1-dev-openeuler, main, main-openeuler]
     name: vLLM Ascend test
-    runs-on: linux-arm64-npu-1
+    runs-on: linux-aarch64-a2-1
     container:
       image: m.daocloud.io/quay.io/ascend/vllm-ascend:${{ matrix.vllm_verison }}
     steps:
diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index 236b10f13c..3933cc2cd5 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -200,7 +200,7 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        os: [linux-arm64-npu-1]
+        os: [linux-aarch64-a2-1]
         vllm_version: [main, v0.9.2]
     name: singlecard e2e test
     runs-on: ${{ matrix.os }}
@@ -299,9 +299,9 @@ jobs:
     needs: [e2e]
     if: ${{ needs.e2e.result == 'success' }}
     strategy:
-      max-parallel: 1
+      max-parallel: 2
       matrix:
-        os: [linux-arm64-npu-4]
+        os: [linux-aarch64-a2-2]
         vllm_version: [main, v0.9.2]
     name: multicard e2e test
     runs-on: ${{ matrix.os }}
diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml
index 9a33b3aca8..788a98a371 100644
--- a/.github/workflows/vllm_ascend_test_long_term.yaml
+++ b/.github/workflows/vllm_ascend_test_long_term.yaml
@@ -42,7 +42,7 @@ jobs:
     strategy:
       max-parallel: 2
       matrix:
-        os: [linux-arm64-npu-1, linux-arm64-npu-4]
+        os: [linux-aarch64-a2-1, linux-aarch64-a2-2]
         vllm_version: [main, v0.9.2]
     name: vLLM Ascend long term test
     runs-on: ${{ matrix.os }}
diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py
index 2922c52c6c..1d60cb479c 100644
--- a/benchmarks/scripts/run_accuracy.py
+++ b/benchmarks/scripts/run_accuracy.py
@@ -50,17 +50,17 @@
 # Command templates for running evaluations
 MODEL_RUN_INFO = {
     "Qwen/Qwen3-30B-A3B":
-    ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
+    ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
      "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
      "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
      ),
     "Qwen/Qwen3-8B-Base":
-    ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
+    ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6'\n"
      "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
      "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
      ),
     "Qwen/Qwen2.5-VL-7B-Instruct":
-    ("export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n"
+    ("export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2'\n"
      "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
      "--apply_chat_template --fewshot_as_multiturn --batch_size 1"),
 }
@@ -102,11 +102,11 @@
 # Model arguments for evaluation
 MODEL_ARGS = {
     "Qwen/Qwen3-8B-Base":
-    "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6",
+    "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6",
     "Qwen/Qwen2.5-VL-7B-Instruct":
-    "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2",
+    "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2",
     "Qwen/Qwen3-30B-A3B":
-    "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True"
+    "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True"
 }
 
 # Whether to apply chat template formatting
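
A minimal sketch (not part of the patch) of how one of the MODEL_RUN_INFO templates above expands into a runnable lm_eval command after the tensor_parallel_size changes; the dataset name here is a placeholder assumption, not taken from the script:

# Hypothetical expansion of a MODEL_RUN_INFO template; the dataset
# name "gsm8k" is a placeholder, not the script's actual task list.
MODEL_RUN_INFO = {
    "Qwen/Qwen3-8B-Base":
    ("export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6'\n"
     "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \\\n"
     "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"),
}

model = "Qwen/Qwen3-8B-Base"
datasets = "gsm8k"  # placeholder task name
# Prints the shell commands a user would run to reproduce the evaluation.
print(MODEL_RUN_INFO[model].format(model=model, datasets=datasets))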