
Commit bd4c9ed

Author: yangcheng (AJ) (committed)
Merge remote-tracking branch 'upstream/v0.9.1-dev' into br_main_into_eplb
2 parents: d79ace8 + e1d282d

95 files changed: +5863 / -1472 lines changed


.github/dependabot.yml

Lines changed: 0 additions & 3 deletions
@@ -2,9 +2,6 @@ version: 2
 updates:
   - package-ecosystem: "github-actions"
     directory: "/"
-    schedule:
-      # Check for updates to GitHub Actions every week
-      interval: "weekly"
     open-pull-requests-limit: 2
     reviewers:
       - "Yikun"
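Note: this change drops the weekly update schedule from the github-actions ecosystem entry. For reference, a typical github-actions update entry in a .github/dependabot.yml pairs the ecosystem with a directory and an update schedule; the sketch below is a generic example of that shape, not this repository's exact file after the change.

version: 2
updates:
  - package-ecosystem: "github-actions"   # keep GitHub Actions workflow dependencies up to date
    directory: "/"                        # workflows live under .github/workflows
    schedule:
      interval: "weekly"                  # how often Dependabot checks for new action versions
    open-pull-requests-limit: 2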

.github/workflows/nightly_benchmarks.yaml

Lines changed: 1 addition & 5 deletions
@@ -18,11 +18,7 @@
 name: 'Benchmarks / Performance'
 # This workflow runs nightly benchmarks for vllm-ascend.

-on:
-  schedule:
-    # Run at 02:00 everyday
-    - cron: '00 18 * * *'
-
+on:
   workflow_dispatch:
     # Allow manual triggering of the workflow

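Note: the deleted trigger used GitHub's cron syntax, which is evaluated in UTC; '00 18 * * *' fires at 18:00 UTC, i.e. 02:00 Beijing time (UTC+8), which matches the removed "Run at 02:00 everyday" comment. A minimal sketch of a workflow that keeps such a nightly schedule alongside a manual trigger (hypothetical workflow name, not this repository's file) might look like:

name: nightly-demo                 # hypothetical workflow, for illustration only

on:
  schedule:
    # Field order: minute hour day-of-month month day-of-week, evaluated in UTC.
    - cron: '00 18 * * *'          # 18:00 UTC == 02:00 Beijing time (UTC+8)
  workflow_dispatch:               # also allow manual runs

jobs:
  nightly:
    runs-on: ubuntu-latest
    steps:
      - run: echo "nightly benchmark placeholder"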
.github/workflows/vllm_ascend_doctest.yaml

Lines changed: 0 additions & 3 deletions
@@ -29,9 +29,6 @@ on:
       - 'tests/e2e/doctests/**'
       - 'tests/e2e/common.sh'
       - 'tests/e2e/run_doctests.sh'
-  schedule:
-    # Runs every 4 hours
-    - cron: '0 */4 * * *'

 # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
 # declared as "shell: bash -el {0}" on steps that need to be properly activated.

.github/workflows/vllm_ascend_test.yaml

Lines changed: 68 additions & 183 deletions
@@ -18,8 +18,6 @@
 name: 'test'

 on:
-  schedule:
-    - cron: '0 23 * * *'
   pull_request:
     branches:
       - 'main'
@@ -44,12 +42,6 @@ defaults:
   run:
     shell: bash -el {0}

-# only cancel in-progress runs of the same workflow
-# and ignore the lint / 1 card / 4 cards test type
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
 jobs:
   lint:
     runs-on: ubuntu-latest
@@ -114,171 +106,32 @@ jobs:
           echo "::add-matcher::.github/workflows/matchers/mypy.json"
           tools/mypy.sh 1 ${{ matrix.python-version }}

-  ut:
-    needs: [lint]
-    name: unit test
-    if: ${{ needs.lint.result == 'success' }}
-    runs-on: ubuntu-latest
-    container:
-      image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
-      env:
-        VLLM_LOGGING_LEVEL: ERROR
-        VLLM_USE_MODELSCOPE: True
-    strategy:
-      matrix:
-        vllm_version: [main, v0.9.1]
-    steps:
-      - name: Install packages
-        run: |
-          apt-get update -y
-          apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          ref: ${{ matrix.vllm_version }}
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/
-          python3 -m pip uninstall -y triton
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
-
-      - name: Install vllm-project/vllm-ascend
-        run: |
-          export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
-          python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
-          python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/
-
-      - name: Run unit test for V1 Engine
-        env:
-          VLLM_USE_V1: 1
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          TORCH_DEVICE_BACKEND_AUTOLOAD: 0
-        run: |
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
-          pytest -sv tests/ut
-
   e2e:
     needs: [lint]
     if: ${{ needs.lint.result == 'success' }}
     strategy:
       max-parallel: 2
       matrix:
-        os: [linux-arm64-npu-1]
-        vllm_version: [main, v0.9.1]
-    name: singlecard e2e test
-    runs-on: ${{ matrix.os }}
-    container:
-      # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
-      image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
-      env:
-        VLLM_LOGGING_LEVEL: ERROR
-    steps:
-      - name: Check npu and CANN info
-        run: |
-          npu-smi info
-          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
-
-      - name: Config mirrors
-        run: |
-          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
-          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-          apt-get update -y
-          apt install git -y
-          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
-
-      - name: Install system dependencies
-        run: |
-          apt-get -y install `cat packages.txt`
-          apt-get -y install gcc g++ cmake libnuma-dev
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          ref: ${{ matrix.vllm_version }}
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Install vllm-project/vllm-ascend
-        env:
-          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
-        run: |
-          pip install -r requirements-dev.txt
-          pip install -v -e .
-
-      - name: Run e2e test for V1 Engine
-        env:
-          VLLM_USE_V1: 1
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          VLLM_USE_MODELSCOPE: True
-        run: |
-          pytest -sv tests/e2e/singlecard/test_offline_inference.py
-          # TODO: switch hf to modelscope
-          VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
-            pytest -sv tests/e2e/singlecard/test_ilama_lora.py
-          # TODO(sss): guided decoding doesn't work, fix it later
-          # pytest -sv tests/e2e/singlecard/test_guided_decoding.py
-          pytest -sv tests/e2e/singlecard/test_camem.py
-          pytest -sv tests/e2e/singlecard/ \
-            --ignore=tests/e2e/singlecard/test_offline_inference.py \
-            --ignore=tests/e2e/singlecard/test_ilama_lora.py \
-            --ignore=tests/e2e/singlecard/test_guided_decoding.py \
-            --ignore=tests/e2e/singlecard/test_camem.py
-
-      - name: Run e2e test on V0 engine
-        if: ${{ github.event_name == 'schedule' }}
-        env:
-          VLLM_USE_V1: 0
-          VLLM_USE_MODELSCOPE: True
-        run: |
-          pytest -sv tests/e2e/singlecard/test_offline_inference.py
-          # TODO: switch hf to modelscope
-          VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
-            pytest -sv tests/e2e/singlecard/test_ilama_lora.py
-          # guided decoding doesn't work, fix it later
-          # pytest -sv tests/e2e/singlecard/test_guided_decoding.py
-          pytest -sv tests/e2e/singlecard/test_camem.py
-          pytest -sv tests/e2e/singlecard/test_prompt_embedding.py
-          pytest -sv tests/e2e/singlecard/ \
-            --ignore=tests/e2e/singlecard/test_offline_inference.py \
-            --ignore=tests/e2e/singlecard/test_ilama_lora.py \
-            --ignore=tests/e2e/singlecard/test_guided_decoding.py \
-            --ignore=tests/e2e/singlecard/test_camem.py \
-            --ignore=tests/e2e/singlecard/test_prompt_embedding.py \
-            --ignore=tests/e2e/singlecard/core/test_ascend_scheduler.py \
-            --ignore=tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py
-
-  e2e-4-cards:
-    needs: [e2e]
-    if: ${{ needs.e2e.result == 'success' }}
-    strategy:
-      max-parallel: 1
-      matrix:
-        os: [linux-arm64-npu-4]
-        vllm_version: [main, v0.9.1]
-    name: multicard e2e test
+        os: [linux-arm64-npu-1, linux-arm64-npu-4]
+        vllm_version: [v0.9.1]
+    concurrency:
+      group: >
+        ${{
+        matrix.os == 'linux-arm64-npu-4'
+        && github.event.pull_request.number
+        && format('pr-{0}-limit-npu-4', github.event.pull_request.number)
+        || format('job-{0}-{1}-{2}', matrix.os, matrix.vllm_version, github.event.pull_request.number)
+        }}
+      cancel-in-progress: false
+    name: vLLM Ascend test
     runs-on: ${{ matrix.os }}
     container:
       # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
       image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
       env:
+        HF_ENDPOINT: https://hf-mirror.com
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
         VLLM_LOGGING_LEVEL: ERROR
-        VLLM_USE_MODELSCOPE: True
     steps:
       - name: Check npu and CANN info
         run: |
@@ -324,32 +177,64 @@ jobs:
         env:
           VLLM_USE_V1: 1
           VLLM_WORKER_MULTIPROC_METHOD: spawn
-          VLLM_USE_MODELSCOPE: True
         run: |
-          # TODO: switch hf to modelscope
-          VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
-            pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
-          # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
-          # To avoid oom, we need to run the test in a single process.
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
-          pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py
+          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
+            # guided decoding doesn't work, fix it later
+            # pytest -sv tests/singlecard/test_guided_decoding.py.py
+            # test_ascend_config.py should be ran separately because it will regenerate the global config many times.
+            pytest -sv tests/singlecard/test_ascend_config.py
+            pytest -sv tests/singlecard/test_camem.py
+            pytest -sv tests/singlecard/core/test_ascend_scheduler.py
+            pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py
+            pytest -sv tests/singlecard/ \
+              --ignore=tests/singlecard/test_offline_inference.py \
+              --ignore=tests/singlecard/test_guided_decoding.py \
+              --ignore=tests/singlecard/test_ascend_config.py \
+              --ignore=tests/singlecard/test_camem.py \
+              --ignore=tests/singlecard/core/test_ascend_scheduler.py \
+              --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
+          else
+            pytest -sv tests/multicard/test_ilama_lora_tp2.py
+            # To avoid oom, we need to run the test in a single process.
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_w4a8_deepseek.py::test_deepseek_W4A8
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py --ignore=tests/multicard/test_w4a8_deepseek.py
+          fi

       - name: Run vllm-project/vllm-ascend test on V0 engine
         if: ${{ github.event_name == 'schedule' }}
         env:
           VLLM_USE_V1: 0
-          VLLM_USE_MODELSCOPE: True
         run: |
-          # TODO: switch hf to modelscope
-          VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
-            pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
-          # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
-          # To avoid oom, we need to run the test in a single process.
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
-          pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py
+          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py
+            # guided decoding doesn't work, fix it later
+            # pytest -sv tests/singlecard/test_guided_decoding.py.py
+            pytest -sv tests/singlecard/test_camem.py
+            # test_ascend_config.py should be ran separately because it will regenerate the global config many times.
+            pytest -sv tests/singlecard/test_ascend_config.py
+            pytest -sv tests/singlecard/test_prompt_embedding.py
+            pytest -sv tests/singlecard/ \
+              --ignore=tests/singlecard/test_offline_inference.py \
+              --ignore=tests/singlecard/test_guided_decoding.py \
+              --ignore=tests/singlecard/test_camem.py \
+              --ignore=tests/singlecard/test_ascend_config.py \
+              --ignore=tests/singlecard/test_prompt_embedding.py \
+              --ignore=tests/singlecard/core/test_ascend_scheduler.py \
+              --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py
+          else
+            pytest -sv tests/multicard/test_ilama_lora_tp2.py
+            # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error.
+            # To avoid oom, we need to run the test in a single process.
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py
+          fi
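Note: the new per-job concurrency block appears to serialize 4-card runs per pull request: when the matrix entry is linux-arm64-npu-4 and a PR number is present, every variant maps to the same group (pr-<N>-limit-npu-4), while other combinations get a per-(os, vllm_version, PR) group; with cancel-in-progress: false, a newer run queues instead of cancelling the one in flight. A minimal, self-contained sketch of the same pattern (hypothetical workflow, job, and runner names) might look like:

name: concurrency-demo             # hypothetical workflow, for illustration only

on:
  pull_request:

jobs:
  demo:
    strategy:
      matrix:
        runner: [small, large]     # hypothetical matrix values
    runs-on: ubuntu-latest
    # Matrix variants that resolve to the same group string are serialized;
    # cancel-in-progress: false queues new runs instead of cancelling running ones.
    concurrency:
      group: >-
        ${{ matrix.runner == 'large'
            && format('pr-{0}-limit-large', github.event.pull_request.number)
            || format('pr-{0}-{1}', github.event.pull_request.number, matrix.runner) }}
      cancel-in-progress: false
    steps:
      - run: echo "running matrix variant ${{ matrix.runner }}"

Jobs whose group expression evaluates to the same string run one at a time; distinct groups run in parallel.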

.github/workflows/vllm_ascend_test_long_term.yaml

Lines changed: 12 additions & 11 deletions
@@ -17,9 +17,6 @@
 name: 'e2e test / long-term-test'

 on:
-  schedule:
-    # Runs at 23:00 UTC (7:00 AM Beijing) every day
-    - cron: '0 23 * * *'
   pull_request:
     types: [ labeled ]

@@ -43,7 +40,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1, linux-arm64-npu-4]
-        vllm_version: [main, v0.9.1]
+        vllm_version: [v0.9.1]
     name: vLLM Ascend long term test
     runs-on: ${{ matrix.os }}
     container:
@@ -97,13 +94,17 @@ jobs:
       - name: Run vllm-project/vllm-ascend long term test
         run: |
           if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
-            # spec decode test
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
+            # v0 spec decode test
+            # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process
+            # pytest -sv tests/long_term/spec_decode_v0 --ignore=tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py
+            # v1 spec decode test
+            # TODO: revert me when test_v1_mtp_correctness.py is fixed
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py
             # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
-            # VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
-            pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
-            pytest -sv tests/e2e/long_term/test_accuracy.py
+            # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_spec_decode.py
+            # accuracy test single card
+            pytest -sv tests/long_term/test_accuracy.py
           else
-            VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py
+            # accuracy test multi card
+            VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py
           fi
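Note: the long-term job keeps a single matrixed definition and branches inside the run step on the runner label. A minimal sketch of that per-runner branching pattern (hypothetical workflow, job, and echo placeholders instead of the real test commands) might look like:

name: matrix-branch-demo           # hypothetical workflow, for illustration only

on:
  workflow_dispatch:

jobs:
  long-term:
    strategy:
      matrix:
        os: [linux-arm64-npu-1, linux-arm64-npu-4]
    runs-on: ${{ matrix.os }}
    steps:
      - name: Run matching test subset
        run: |
          # Branch on the runner label so single-card and multi-card suites
          # can share one job definition.
          if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
            echo "would run single-card tests here"
          else
            echo "would run multi-card tests here"
          fi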

.github/workflows/vllm_ascend_test_pd.yaml

Lines changed: 5 additions & 4 deletions
@@ -17,9 +17,6 @@
 name: 'e2e test / pd-disaggregation'

 on:
-  schedule:
-    # Runs at 23:00 UTC (7:00 AM Beijing) every day
-    - cron: '0 23 * * *'
   pull_request:
     types: [ labeled ]

@@ -41,7 +38,7 @@ jobs:
     if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
     strategy:
       matrix:
-        vllm_verison: [main, v0.9.1]
+        vllm_verison: [v0.9.1]
     name: vLLM Ascend prefilling decoding disaggregation test
     runs-on: linux-arm64-npu-static-8

@@ -106,3 +103,7 @@ jobs:
       - name: Run vllm-project/vllm-ascend PD Disaggregation test
         run: |
           pytest -sv tests/e2e/pd_disaggreate/test_pd_e2e.py
+
+      - name: Run vllm-project/vllm-ascend PD Disaggregation edge test
+        run: |
+          bash tests/e2e/pd_disaggreate/run_edge_case_test.sh
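Note: this job remains gated on PR labels through its job-level if: expression (pd-test plus ready-for-test, or a schedule event). A minimal sketch of the same label-gating pattern in isolation (hypothetical workflow name; label names reused from the source for illustration) might look like:

name: label-gated-demo             # hypothetical workflow, for illustration only

on:
  pull_request:
    types: [ labeled ]

jobs:
  gated:
    # Skipped unless the PR carries both labels; a scheduled trigger would need
    # its own branch in the condition, as the real workflow has.
    if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') }}
    runs-on: ubuntu-latest
    steps:
      - run: echo "required labels present"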
