From 07411f9f645ee29543b71f6d34fa58e1b6a0372d Mon Sep 17 00:00:00 2001 From: 22dimensions Date: Tue, 10 Jun 2025 10:07:36 +0800 Subject: [PATCH 01/28] [CI] remove old quantization model (#1003) remove old quantization model, and new models will be added to testcase later. Signed-off-by: 22dimensions Signed-off-by: wangxiaoxin (A) --- tests/singlecard/test_offline_inference.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/singlecard/test_offline_inference.py b/tests/singlecard/test_offline_inference.py index 553d109f3..006883863 100644 --- a/tests/singlecard/test_offline_inference.py +++ b/tests/singlecard/test_offline_inference.py @@ -37,6 +37,10 @@ "Qwen/Qwen3-0.6B-Base", ] MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"] + +QUANTIZATION_MODELS = [ + "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new", +] os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" QUANTIZATION_MODELS = [ From 9181e92fa316ead9f27d87d48dc93e5cb1836e54 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Tue, 10 Jun 2025 13:29:09 +0800 Subject: [PATCH 02/28] Update 0.9.0rc1 contributors info (#1148) ### What this PR does / why we need it? Update 0.9.0rc1 contributors info ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed Signed-off-by: Yikun Jiang Signed-off-by: wangxiaoxin (A) --- docs/source/community/contributors.md | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/docs/source/community/contributors.md b/docs/source/community/contributors.md index efbea6657..d61259de8 100644 --- a/docs/source/community/contributors.md +++ b/docs/source/community/contributors.md @@ -1,4 +1,6 @@ -# Maintainers +# Maintainers and contributors + +## Maintainers | Name | Github ID | Date | |:-----------:|:-----:|:-----:| @@ -6,13 +8,30 @@ | Yikun Jiang| [@Yikun](https://github.com/Yikun) | 2025/02 | | Yi Gan| [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/02 | -# Contributors +## Contributors vLLM Ascend every release would not have been possible without the following contributors: +Updated on 2025-06-10: + | Number | Contributor | Date | Commit ID | |:------:|:-----------:|:-----:|:---------:| -| 51 | [@YisongJiang](https://github.com/YisongJiang) | 2025/5/29 | [90afaf6](https://github.com/vllm-project/vllm-ascend/commit/90afaf6306f680307462becf3c78585737579851) | +| 66 | [@Yuxiao-Xu](https://github.com/Yuxiao-Xu) | 2025/6/9 | [6b853f1](https://github.com/vllm-project/vllm-ascend/commit/6b853f15fe69ba335d2745ebcf14a164d0bcc505) | +| 65 | [@ChenTaoyu-SJTU](https://github.com/ChenTaoyu-SJTU) | 2025/6/7 | [20dedba](https://github.com/vllm-project/vllm-ascend/commit/20dedba5d1fc84b7ae8b49f9ce3e3649389e2193) | +| 64 | [@zxdukki](https://github.com/zxdukki) | 2025/6/7 | [87ebaef](https://github.com/vllm-project/vllm-ascend/commit/87ebaef4e4e519988f27a6aa378f614642202ecf) | +| 63 | [@sdmyzlp](https://github.com/sdmyzlp) | 2025/6/7 | [3640c60](https://github.com/vllm-project/vllm-ascend/commit/3640c60b0eb4d4cb104e20bfa406d3f1d17920a7) | +| 62 | [@weijinqian0](https://github.com/weijinqian0) | 2025/6/7 | [e9ada68](https://github.com/vllm-project/vllm-ascend/commit/e9ada685ece798f9fe0d4a287e3f5246a8a7207b) | +| 61 | [@hahazhky](https://github.com/hahazhky) | 2025/6/6 | [0b12c2a](https://github.com/vllm-project/vllm-ascend/commit/0b12c2acf7d9fd192beebebf662298067d9a5435) | +| 60 | [@depeng1994](https://github.com/depeng1994) | 2025/6/6 | 
[6b094a2](https://github.com/vllm-project/vllm-ascend/commit/6b094a2bd49a8a41eb3647568b2d9e5b337db81f) | +| 59 | [@David9857](https://github.com/David9857) | 2025/6/5 | [78431b3](https://github.com/vllm-project/vllm-ascend/commit/78431b34694dfa3c8f54ed7cc626660318557927) | +| 58 | [@momo609](https://github.com/momo609) | 2025/6/5 | [908a851](https://github.com/vllm-project/vllm-ascend/commit/908a851a776cfd9051cc062119e6ec481561c6f7) | +| 57 | [@zhangxinyuehfad](https://github.com/zhangxinyuehfad) | 2025/6/5 | [7737aaa](https://github.com/vllm-project/vllm-ascend/commit/7737aaa40f699b233a35fb61e908b687adc1e2e5) | +| 56 | [@NINGBENZHE](https://github.com/NINGBENZHE) | 2025/6/3 | [6ec64a3](https://github.com/vllm-project/vllm-ascend/commit/6ec64a3f9686df65b5a23a41aa301e669db19099) | +| 55 | [@XWFAlone](https://github.com/XWFAlone) | 2025/5/30 | [3442fbd](https://github.com/vllm-project/vllm-ascend/commit/3442fbdb235b4c6d72c2bc64a49707a7bd89958e) | +| 54 | [@YisongJiang](https://github.com/YisongJiang) | 2025/5/29 | [90afaf6](https://github.com/vllm-project/vllm-ascend/commit/90afaf6306f680307462becf3c78585737579851) | +| 53 | [@ponix-j](https://github.com/ponix-j) | 2025/5/23 | [df58fb8](https://github.com/vllm-project/vllm-ascend/commit/df58fb80eee24139fc61c495be3ce79cf81b3f73) | +| 52 | [@ttanzhiqiang](https://github.com/ttanzhiqiang) | 2025/5/23 | [dc6172e](https://github.com/vllm-project/vllm-ascend/commit/dc6172efd3860ce95b40a7b3e93611f875f06d40) | +| 51 | [@yangpuPKU](https://github.com/yangpuPKU) | 2025/5/23 | [46df67a](https://github.com/vllm-project/vllm-ascend/commit/46df67a5e9ab73fade08cbb2d8c0155cee7316d1) | | 50 | [@wonderful199082](https://github.com/wonderful199082) | 2025/5/20 | [5cf9ff1](https://github.com/vllm-project/vllm-ascend/commit/5cf9ff18e91b0b7031c258d71a257b8e24689763) | | 49 | [@22dimensions](https://github.com/22dimensions) | 2025/5/17 | [a8730e7](https://github.com/vllm-project/vllm-ascend/commit/a8730e7a3c4ac6c4b39a5946c943252fdea6cce5) | | 48 | [@cxcxflying](https://github.com/cxcxflying) | 2025/5/13 | [e564470](https://github.com/vllm-project/vllm-ascend/commit/e56447033889ca95df512208cab22ef832bfdf07) | From 3e55d9e269a79c9f9bc7dcf05796afe3953742c3 Mon Sep 17 00:00:00 2001 From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com> Date: Tue, 10 Jun 2025 14:35:44 +0800 Subject: [PATCH 03/28] [CI] Make accuarcy CI and report work (#1078) ### What this PR does / why we need it? Make accuarcy CI and report work ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Manaully review Signed-off-by: hfadzxy Signed-off-by: wangxiaoxin (A) --- .github/workflows/accuracy_report.yaml | 224 ++++++++++++------ .github/workflows/accuracy_test.yaml | 13 +- benchmarks/scripts/run_accuracy.py | 8 +- .../evaluation/accuracy_report/index.md | 6 + .../developer_guide/evaluation/index.md | 1 + 5 files changed, 170 insertions(+), 82 deletions(-) create mode 100644 docs/source/developer_guide/evaluation/accuracy_report/index.md diff --git a/.github/workflows/accuracy_report.yaml b/.github/workflows/accuracy_report.yaml index 1d03e1318..fe1dbd103 100644 --- a/.github/workflows/accuracy_report.yaml +++ b/.github/workflows/accuracy_report.yaml @@ -19,110 +19,184 @@ name: Accuracy Report on: workflow_dispatch: inputs: - branch: - description: 'choose a dev branch to pr' + vllm-ascend-branch: + description: 'vllm-ascend branch:' required: true - vllm-ascend-version: - description: 'what vllm-ascend version to accuracy test?' 
+ type: choice + options: + - main + - v0.7.3-dev + models: + description: 'models:' required: true - type: string + type: choice + options: + - all + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen3-8B-Base + default: 'all' + jobs: - download: + download_reports: runs-on: ubuntu-latest + strategy: + matrix: + model: ${{ fromJSON( + (github.event.inputs.models == 'all' && + '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || + (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && + '["Qwen/Qwen2.5-7B-Instruct"]') || + (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && + '["Qwen/Qwen2.5-VL-7B-Instruct"]') || + (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' && + '["Qwen/Qwen3-8B-Base"]') + ) }} + + version: [0, 1] + exclude: + - model: 'Qwen/Qwen2.5-VL-7B-Instruct' + version: 1 + fail-fast: false + + name: Download ${{ matrix.model }} V${{ matrix.version }} steps: - name: Checkout repository uses: actions/checkout@v4 with: - ref: ${{ github.event.inputs.branch }} - - - name: Debug List Artifacts - run: gh api /repos/${{ github.repository }}/actions/artifacts - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ref: ${{ github.event.inputs.vllm-ascend-branch }} - - name: Query artifact run id for Qwen2.5-VL-7B-Instruct V0 latest artifact - id: get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0 + - name: Get base model name + id: get_basename run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") - RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-VL-7B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + model_base_name=$(basename "${{ matrix.model }}") + echo "model_base_name=$model_base_name" >> $GITHUB_OUTPUT + shell: bash - - name: Query artifact run id for Qwen2.5-7B-Instruct V0 latest artifact - id: get_Qwen2_5_7B_Instruct_latest_run_id_V0 + - name: Query artifact run id + id: get_run_id run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") + ARTIFACT_PATTERN="${{ github.event.inputs.vllm-ascend-branch }}-${{ steps.get_basename.outputs.model_base_name }}-V${{ matrix.version }}-report" + echo "Querying artifacts with pattern: $ARTIFACT_PATTERN" + + ARTIFACT_JSON=$(gh api --paginate /repos/${{ github.repository }}/actions/artifacts || echo "{}") + RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-7B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" + jq -s -r --arg pattern "$ARTIFACT_PATTERN" \ + '[.[].artifacts[]] | map(select(.name | test($pattern))) | sort_by(.created_at) | last | .workflow_run.id // empty') + + if [ -z "$RUN_ID" ]; then + echo "::warning::No artifact found matching pattern $ARTIFACT_PATTERN. Skipping download." 
+ echo "runid=" >> $GITHUB_OUTPUT + else + echo "Found matching artifact with run ID: $RUN_ID" + echo "runid=$RUN_ID" >> $GITHUB_OUTPUT + fi env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Query artifact run id for Qwen3-8B-Base V0 latest artifact - id: get_Qwen3_8B_Base_latest_run_id_V0 - run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") - RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen3-8B-Base-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Download Qwen/Qwen2.5-VL-7B-Instruct V0 Artifact + - name: Download Artifact + if: ${{ steps.get_run_id.outputs.runid != '' }} uses: actions/download-artifact@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-VL-7B-Instruct-V0-report - path: ./docs/source/developer_guide/evaluation/accuracy_report - github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }} + name: ${{ github.event.inputs.vllm-ascend-branch }}-${{ steps.get_basename.outputs.model_base_name }}-V${{ matrix.version }}-report + path: ./docs/source/developer_guide/evaluation/accuracy_report_bak + github-token: ${{ secrets.GITHUB_TOKEN }} + repository: ${{ github.repository }} + run-id: ${{ steps.get_run_id.outputs.runid }} + + - name: Upload reports artifact + if: ${{ steps.get_run_id.outputs.runid != '' }} + uses: actions/upload-artifact@v4 + with: + name: report-${{ steps.get_basename.outputs.model_base_name }}-v${{ matrix.version }} + path: ./docs/source/developer_guide/evaluation/accuracy_report_bak/*.md + retention-days: 90 - - name: Download Qwen/Qwen2.5-7B-Instruct Artifact - uses: actions/download-artifact@v4 + create_pr: + runs-on: ubuntu-latest + needs: download_reports + steps: + - name: Checkout repository + uses: actions/checkout@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-7B-Instruct-V0-report - path: ./docs/source/developer_guide/evaluation/accuracy_report - github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }} + ref: ${{ github.event.inputs.vllm-ascend-branch }} + + - name: Setup workspace + run: mkdir -p ./accuracy/accuracy_report - - name: Download Qwen/Qwen3-8B-Base Artifact + - name: Download only current run reports uses: actions/download-artifact@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen3-8B-Base-V0-report path: ./docs/source/developer_guide/evaluation/accuracy_report + pattern: report-* github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }} + run-id: ${{ github.run_id }} + + - name: Delete old report + run: | + find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! 
-name 'index.md' -delete + find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \; + find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete - - name: Display Files - working-directory: ./docs/source/developer_guide/evaluation/accuracy_report + - name: Generate step summary + if: ${{ always() }} run: | - cat ./Qwen2.5-VL-7B-Instruct.md - cat ./Qwen2.5-7B-Instruct.md - cat ./Qwen3-8B-Base.md - - - name: Create Pull Request for markdown update + for report in ./docs/source/developer_guide/evaluation/accuracy_report/*.md; do + filename=$(basename "$report") + # skip index.md + if [ "$filename" = "index.md" ]; then + continue + fi + + if [ -f "$report" ]; then + { + echo -e "\n\n---\n" + echo "## 📄 Report File: $(basename $report)" + cat "$report" + } >> "$GITHUB_STEP_SUMMARY" + fi + done + + - name: Update accuracy_report/index.md + run: | + REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report" + INDEX_MD="$REPORT_DIR/index.md" + + { + echo "# Accuracy Report" + echo "" + echo "::: {toctree}" + echo ":caption: Accuracy Report" + echo ":maxdepth: 1" + + for report in "$REPORT_DIR"/*.md; do + filename="$(basename "$report" .md)" + if [ "$filename" != "index" ]; then + echo "$filename" + fi + done + + echo ":::" + } > "$INDEX_MD" + + - name: Create Pull Request uses: peter-evans/create-pull-request@v7 with: token: ${{ secrets.PR_TOKEN }} - base: ${{ github.event.inputs.branch }} - branch: auto-pr/accuracy-test - commit-message: "Update accuracy report for ${{ github.event.inputs.branch }}" + base: ${{ github.event.inputs.vllm-ascend-branch }} + branch: auto-pr/accuracy-report + commit-message: "Update accuracy reports for ${{ github.event.inputs.vllm-ascend-branch }}" add-paths: ./docs/source/developer_guide/evaluation/accuracy_report/*.md - title: "[Doc]Update accuracy report for ${{ github.event.inputs.branch }}" + title: "[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-branch }}" body: | - The accuracy results running on Ascend NPU have changed, I'm updating the report. - Please review the changes. 
- + The accuracy results running on NPU Altlas A2 have changed, updating reports for: + ${{ + github.event.inputs.models == 'all' + && 'All models (Qwen2.5-7B-Instruct, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)' + || github.event.inputs.models + }} + - [Workflow run][1] - - [Qwen2.5-7B-Instruct accuracy report][2] - - [Qwen2.5-VL-7B-Instruct accuracy report][3] - - [Qwen3-8B-Base accuracy report][4] - - [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - [2]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }} - [3]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }} - [4]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }} + + [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} \ No newline at end of file diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index d0a7debaa..d1a61230e 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -96,7 +96,7 @@ jobs: # - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct model_name: ${{ fromJSON( (github.event.inputs.models == 'all' && - '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","model_name":"Qwen/Qwen3-8B-Base"]') || + '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && '["Qwen/Qwen2.5-7B-Instruct"]') || (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && @@ -201,6 +201,7 @@ jobs: pip show torch | grep "Version:" | awk '{print "GHA_TORCH_VERSION="$2}' pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}' pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//' + echo "GHA_VLLM_ASCEND_VERSION=${{ github.event.inputs.vllm-ascend-version || github.ref }}" } >> "$GITHUB_ENV" - name: Print versions @@ -209,7 +210,7 @@ jobs: echo "Torch NPU: ${{ env.GHA_TORCH_NPU_VERSION }}" echo "Torch: ${{ env.GHA_TORCH_VERSION }}" echo "vLLM: ${{ env.GHA_VLLM_VERSION }}" - echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" + echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION }}" - name: Run Accuracy Test for V${{ matrix.vllm_use_version }} id: report @@ -238,10 +239,16 @@ jobs: run: | cat ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md >> $GITHUB_STEP_SUMMARY + - name: Sanitize version string for artifact naming + run: | + SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}" + echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV" + - name: Upload Report for V${{ matrix.vllm_use_version }} + if: ${{ github.event_name == 'workflow_dispatch' }} uses: actions/upload-artifact@v4 with: - name: "${{ env.GHA_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}-report" + name: "${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}-report" path: ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md if-no-files-found: warn retention-days: 90 diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py index f508feda7..79c58bcbc 100644 --- a/benchmarks/scripts/run_accuracy.py +++ b/benchmarks/scripts/run_accuracy.py @@ -26,7 +26,7 @@ import lm_eval import torch -UNIMODAL_MODEL_NAME = 
["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B"] +UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B-Base"] UNIMODAL_TASK = ["ceval-valid", "gsm8k"] MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"] MULTIMODAL_TASK = ["mmmu_val"] @@ -36,17 +36,17 @@ MODEL_RUN_INFO = { "Qwen/Qwen2.5-7B-Instruct": ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" - "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen3-8B-Base": ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" - "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen2.5-VL-7B-Instruct": ("export MODEL_ARGS='pretrained={model}, max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2'\n" - "lm_eval --model vllm-vlm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --batch_size 1"), } diff --git a/docs/source/developer_guide/evaluation/accuracy_report/index.md b/docs/source/developer_guide/evaluation/accuracy_report/index.md new file mode 100644 index 000000000..51fae4886 --- /dev/null +++ b/docs/source/developer_guide/evaluation/accuracy_report/index.md @@ -0,0 +1,6 @@ +# Accuracy Report + +:::{toctree} +:caption: Accuracy Report +:maxdepth: 1 +::: \ No newline at end of file diff --git a/docs/source/developer_guide/evaluation/index.md b/docs/source/developer_guide/evaluation/index.md index 324c2e2f2..ea861abb1 100644 --- a/docs/source/developer_guide/evaluation/index.md +++ b/docs/source/developer_guide/evaluation/index.md @@ -6,6 +6,7 @@ using_lm_eval using_opencompass using_evalscope +accuracy_report/index ::: :::{toctree} From d1095bca6eeabb52ad204ed0539a0543606f11f0 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 10 Jun 2025 17:14:25 +0800 Subject: [PATCH 04/28] [Bugfix] add compilation/__init__.py to fix import error (#1152) 1. Add `__init__.py` for vllm_ascend/compilation to make sure it's a python module 2. Fix model runner bug to keep the same with vllm 3. Add release note for 0.9.0rc2 --------- Signed-off-by: wangxiyuan Signed-off-by: wangxiaoxin (A) --- docs/source/conf.py | 4 ++-- docs/source/developer_guide/versioning_policy.md | 3 +++ docs/source/faqs.md | 10 ++++------ docs/source/user_guide/graph_mode.md | 2 +- docs/source/user_guide/release_notes.md | 8 ++++++++ 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index ba8d83949..63fb5dbe4 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -69,10 +69,10 @@ # the branch of vllm-ascend, used in vllm-ascend clone and image tag # - main branch: 'main' # - vX.Y.Z branch: latest vllm-ascend release tag - 'vllm_ascend_version': 'v0.9.0rc1', + 'vllm_ascend_version': 'v0.9.0rc2', # the newest release version of vllm-ascend and matched vLLM, used in pip install. # This value should be updated when cut down release. 
- 'pip_vllm_ascend_version': "0.9.0rc1", + 'pip_vllm_ascend_version': "0.9.0rc2", 'pip_vllm_version': "0.9.0", # CANN image tag 'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10", diff --git a/docs/source/developer_guide/versioning_policy.md b/docs/source/developer_guide/versioning_policy.md index bbb08c3be..1533b0af5 100644 --- a/docs/source/developer_guide/versioning_policy.md +++ b/docs/source/developer_guide/versioning_policy.md @@ -22,6 +22,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin: | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo | |-------------|--------------|------------------|-------------|--------------------|--------------| +| v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | | @@ -34,6 +35,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin: | Date | Event | |------------|-------------------------------------------| +| 2025.06.10 | Release candidates, v0.9.0rc2 | | 2025.06.09 | Release candidates, v0.9.0rc1 | | 2025.05.29 | v0.7.x post release, v0.7.3.post1 | | 2025.05.08 | v0.7.x Final release, v0.7.3 | @@ -71,6 +73,7 @@ Usually, each minor version of vLLM (such as 0.7) will correspond to a vLLM Asce | Branch | Status | Note | |------------|--------------|--------------------------------------| | main | Maintained | CI commitment for vLLM main branch and vLLM 0.9.x branch | +| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.0 and 0.9.1 version | | v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version | | v0.7.1-dev | Unmaintained | Replaced by v0.7.3-dev | diff --git a/docs/source/faqs.md b/docs/source/faqs.md index 8c840be45..1d355b5e9 100644 --- a/docs/source/faqs.md +++ b/docs/source/faqs.md @@ -3,7 +3,7 @@ ## Version Specific FAQs - [[v0.7.3.post1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1007) -- [[v0.9.0rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115) +- [[v0.9.0rc2] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115) ## General FAQs @@ -69,14 +69,14 @@ If all above steps are not working, feel free to submit a GitHub issue. ### 7. How does vllm-ascend perform? -Currently, only some models are improved. Such as `Qwen2 VL`, `Deepseek V3`. Others are not good enough. In the future, we will support graph mode and custom ops to improve the performance of vllm-ascend. And when the official release of vllm-ascend is released, you can install `mindie-turbo` with `vllm-ascend` to speed up the inference as well. +Currently, only some models are improved. Such as `Qwen2 VL`, `Deepseek V3`. Others are not good enough. From 0.9.0rc2, Qwen and Deepseek works with graph mode to play a good performance. What's more, you can install `mindie-turbo` with `vllm-ascend v0.7.3` to speed up the inference as well. ### 8. How vllm-ascend work with vllm? vllm-ascend is a plugin for vllm. Basically, the version of vllm-ascend is the same as the version of vllm. For example, if you use vllm 0.7.3, you should use vllm-ascend 0.7.3 as well. For main branch, we will make sure `vllm-ascend` and `vllm` are compatible by each commit. ### 9. Does vllm-ascend support Prefill Disaggregation feature? -Currently, only 1P1D is supported by vllm. 
For vllm-ascend, it'll be done by [this PR](https://github.com/vllm-project/vllm-ascend/pull/432). For NPND, vllm is not stable and fully supported yet. We will make it stable and supported by vllm-ascend in the future. +Currently, only 1P1D is supported on V0 Engine. For V1 Engine or NPND support, We will make it stable and supported by vllm-ascend in the future. ### 10. Does vllm-ascend support quantization method? @@ -84,9 +84,7 @@ Currently, w8a8 quantization is already supported by vllm-ascend originally on v ### 11. How to run w8a8 DeepSeek model? -Currently, w8a8 DeepSeek is working in process: [support AscendW8A8 quantization](https://github.com/vllm-project/vllm-ascend/pull/511) - -Please run DeepSeek with BF16 now, following the [Multi-Node DeepSeek inferencing tutorail](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_node.html) +Please following the [quantization inferencing tutorail](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_npu_quantization.html) and replace model to DeepSeek. ### 12. There is not output in log when loading models using vllm-ascend, How to solve it? diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md index 2bd83ffe7..126978f95 100644 --- a/docs/source/user_guide/graph_mode.md +++ b/docs/source/user_guide/graph_mode.md @@ -54,7 +54,7 @@ outputs = model.generate("Hello, how are you?") online example: ```shell -vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enable": True}}' +vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enable": true}}' ``` You can find more detail about additional config [here](./additional_config.md) diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md index 42a944f44..8f72b937e 100644 --- a/docs/source/user_guide/release_notes.md +++ b/docs/source/user_guide/release_notes.md @@ -1,5 +1,13 @@ # Release note +## v0.9.0rc2 - 2025.06.10 + +This release contains some quick fixes for v0.9.0rc1. Please use this release instead of v0.9.0rc1. + +### Highlights + +- Fix the import error when vllm-ascend is installed without editable way. [#1152](https://github.com/vllm-project/vllm-ascend/pull/1152) + ## v0.9.0rc1 - 2025.06.09 This is the 1st release candidate of v0.9.0 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. From this release, V1 Engine is recommended to use. The code of V0 Engine is frozen and will not be maintained any more. Please set environment `VLLM_USE_V1=1` to enable V1 Engine. From 7eb9f23825c353f199c11d9946c8e4c84e70b838 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 10 Jun 2025 17:18:09 +0800 Subject: [PATCH 05/28] [CI] Run e2e after pre check pass (#1132) Make sure the lint test passed before start the e2e test to save compute resource. Updated the patch doc to make sure the CI works as expect. 
Signed-off-by: wangxiyuan Signed-off-by: wangxiaoxin (A) --- .github/workflows/codespell.yml | 47 ---------- .github/workflows/mypy.yaml | 68 -------------- .github/workflows/ruff.yml | 48 ---------- .github/workflows/vllm_ascend_test.yaml | 65 +++++++++++++- .github/workflows/yapf.yml | 48 ---------- vllm_ascend/patch/__init__.py | 112 ++++++++++++++---------- 6 files changed, 128 insertions(+), 260 deletions(-) delete mode 100644 .github/workflows/codespell.yml delete mode 100644 .github/workflows/mypy.yaml delete mode 100644 .github/workflows/ruff.yml delete mode 100644 .github/workflows/yapf.yml diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml deleted file mode 100644 index a239d38fa..000000000 --- a/.github/workflows/codespell.yml +++ /dev/null @@ -1,47 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: codespell - -on: - pull_request: - branches: - - 'main' - - '*-dev' - -jobs: - codespell: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Run codespell check - run: | - CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**') - CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn') - - codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}" diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml deleted file mode 100644 index 81cd69afe..000000000 --- a/.github/workflows/mypy.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: mypy - -on: - pull_request: - branches: - - 'main' - - '*-dev' - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. 
Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. - paths: - - '**/*.py' - - '.github/workflows/mypy.yaml' - - 'tools/mypy.sh' - - 'mypy.ini' - -jobs: - mypy: - runs-on: ubuntu-latest - strategy: - matrix: - # TODO(yikun): Add 3.12 back when torch-npu support 3.12 - python-version: ["3.9", "3.10", "3.11"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu - - - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v4 - with: - repository: vllm-project/vllm - path: vllm-empty - - - name: Install vllm-project/vllm from source - working-directory: vllm-empty - run: | - pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu - VLLM_TARGET_DEVICE=empty pip install . - - - name: Mypy - run: | - echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml deleted file mode 100644 index a19e6a89d..000000000 --- a/.github/workflows/ruff.yml +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: ruff - -on: - pull_request: - branches: - - 'main' - - '*-dev' - -jobs: - ruff: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 49a85f5be..81f1bbe8f 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. # -name: 'e2e test' +name: 'test' on: pull_request: @@ -29,6 +29,9 @@ on: - '!docs/**' - 'pytest.ini' - '!benchmarks/**' + - 'tools/mypy.sh' + - 'mypy.ini' + # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. # It's used to activate ascend-toolkit environment variables. 
@@ -37,7 +40,65 @@ defaults: shell: bash -el {0} jobs: - test: + lint: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Run codespell check + run: | + CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**') + CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn') + + codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}" + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only + - name: Running yapf + run: | + python -m pip install --upgrade pip + pip install toml + pip install yapf==0.32.0 + yapf --diff --recursive . + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: vllm-empty + run: | + pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu + VLLM_TARGET_DEVICE=empty pip install . + + - name: Mypy Check + run: | + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tools/mypy.sh 1 ${{ matrix.python-version }} + + e2e: + needs: [lint] + if: ${{ needs.lint.result == 'success' }} strategy: max-parallel: 2 matrix: diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml deleted file mode 100644 index 64497d156..000000000 --- a/.github/workflows/yapf.yml +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: yapf - -on: - pull_request: - branches: - - 'main' - - '*-dev' - paths: - - "**/*.py" - - .github/workflows/yapf.yml - -jobs: - yapf: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install toml - pip install yapf==0.32.0 - - name: Running yapf - run: | - yapf --diff --recursive . 
diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index ccf9bd9e0..4be92c23a 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -35,6 +35,17 @@ # -------------------------------- # * Platform Patch: # ================= +# ** File: platform/patch_0_9_0/patch_distributed.py** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.distributed.utils.stateless_init_torch_distributed_process_group()` +# Why: +# vllm distributed use gloo backend by default to initialize stateless process group, but we want to use hccl here +# How: +# Add hccl backend to the `stateless_init_torch_distributed_process_group` +# Related PR (if no, explain why): +# https://github.com/vllm-project/vllm/pull/18763 +# Future Plan: +# Remove this patch once vllm is upgraded to 0.9.1 # ** File: platform/patch_common/patch_distributed.py** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.distributed.parallel_state.destroy_model_parallel()` @@ -44,56 +55,40 @@ # platform owned `CoordinatorGroup` to make sure all the CoordinateGroup can be properly destroyed # How: # Call `vllm_ascend.distributed.parallel_state method `destroy_platform_model_parallel` to destroy all the `CoordinateGroup` -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# Related PR (if no, explain why): # Future Plan: # Remove those patch when vllm merged them -# 2. `vllm.distributed.stateless_init_torch_distributed_process_group()` +# 2. `vllm.v1.engine.core.DPEngineCoreProc._init_data_parallel` # Why: -# The stateless process group can not be initialized except from gloo and nccl backend, vllm-ascend -# needs to initialize its own stateless process group for communication, so we add the platform related -# call to the `stateless_init_torch_distributed_process_group`, to enable other platform which may support -# stateless process group initialize method +# There is some bug for ASCEND_RT_VISIBLE_DEVICES usage. # How: -# rewrite stateless_init_torch_distributed_process_group to judge if there is a stateless process group initialize -# method and call platform method `platform_register_backend` to initialize them -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# The ASCEND_RT_VISIBLE_DEVICES related code is dropped. +# Related PR (if no, explain why): +# No, this is a bug for vllm ascend # Future Plan: -# Remove those patch when vllm merged them -# 3. `ParallelConfig.get_next_dp_init_port` +# Remove this patch once ASCEND_RT_VISIBLE_DEVICES bug is fixed. +# 3. `vllm.config.ParallelConfig.get_next_dp_init_port` # Why: -# We want to get dp port from env variable, so the multi-node inference can be properly initialized and run. +# vllm doesn't support get port from environment. # How: -# Get the dp port from env variable enable multi-mode dp inference -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# Add the logic to get port from environment. +# Related PR (if no, explain why): +# Need a PR to vllm to support get port from environment. # Future Plan: -# Its a workaround in vllm-ascend to enable multi-node dp inference, maybe removed if vllm have better plan -# on multi-node dp inference implementation -# 4. `ParallelConfig.stateless_init_dp_group` +# Remove those patch when vllm merged them +# 4. 
`vllm.config.ParallelConfig.ParallelConfig.stateless_init_dp_group` # Why: # vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to # get better performance # How: -# adopt nccl backend to init process group -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# adopt nccl backend to init process group.(Now we still use gloo, it's just a placeholder, we'll use nccl in the future) +# Related PR (if no, explain why): +# Need a PR to vllm to support more backend. # Future Plan: -# Remove those patch when vllm merged them -# +# Remove those patch when vllm support more backend. # # * Worker Patch: # =============== -# ** File: worker/patch_common/patch_metrics.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. `vllm.spec_decode.metrics.AsyncMetricsCollector.maybe_collect_rejsample_metrics` -# Why: -# There are cuda hard code (current_platform.is_cuda_alike()) in -# `AsyncMetricsCollector.maybe_collect_rejsample_metrics` -# How: -# Change to use `current_platform.Event` to determine whether to return None -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... -# https://github.com/vllm-project/vllm/pull/14411 -# Future Plan: -# Revert it when the related pr is merged in vllm. -# # ** File: worker/patch_common/patch_minicpm.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.model_executor.models.minicpm.MiniCPMAttention.forward` @@ -103,7 +98,7 @@ # However float32 is not supported in cann rope op, thus we keep this patch # How: # Removed the dtype convert operations in forward -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # NO, only for npu due to rope op. # Future Plan: # Keep this patch in vllm-ascend. @@ -119,7 +114,7 @@ # - support attention metadata register to the set supported spec decode # - offer a api in platform to determine whether spec decode is supported, # and deprecate is_cuda_alike in it. -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: @@ -131,14 +126,14 @@ # vLLM `Remove Sampler from Model Code` so vllm-ascend needs adapt to this change. # How: # Use vLLM 0.8.4 method to patch it. -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: # Remove it when we identify the reasons clearly. # # ** File: worker/patch_common/patch_spec_decode_worker.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker` # Why: # We need to use the patched `TP1DraftModelRunner` in `SpecDecodeWorker.create_worker`. @@ -146,14 +141,14 @@ # `FlashAttentionMetadata` # How: # ditto -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... 
+# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: # Revert it when the related pr is merged in vllm and vllm-ascend. # # ** File: worker/patch_common/patch_eagle.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.spec_decode.eagle.prepare_inputs` # Why: # We need to use the patched `prepare_input_kernel` in `eagle.prepare_inputs`. @@ -161,12 +156,12 @@ # kernel, ascend is now not support triton kernel. # How: # Re-implementation the `prepare_input_kernel` triton kernel by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - Ascend doesn't support triton # Future Plan: # Revert it when the ascend support triton kernel. # -# ** File: v1/sample/sampler.py ** +# ** File: worker/patch_common/patch_sampler.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.sample.sampler.Sampler.apply_top_k_top_p` # Why: @@ -175,21 +170,44 @@ # to improve performance. # How: # Re-implementation the `apply_top_k_top_p` function by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm-ascend/pull/970 # Future Plan: # Revert it when the ascend scatter performance improves. # -# ** File: v1/sample/sampler.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~s -# 1. `vllm.v1.sample.sampler.Sampler.apply_min_p` +# 2. `vllm.v1.sample.sampler.Sampler.apply_min_p` # Why: # We need to use the patched `apply_min_p` in `sample`. # The mainly reason to overwrite `apply_min_p` is # to improve performance. # How: # Re-implementation the `apply_min_p` function by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm-ascend/pull/970 # Future Plan: # Revert it when the ascend indexput performance improves. +# +# ** File: worker/patch_common/patch_distributed.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.distributed.parallel_state.GroupCoordinator` +# Why: +# vllm doesn't support all_to_all for GroupCoordinator. +# How: +# Add all_to_all implementation for GroupCoordinator. +# Related PR (if no, explain why): +# Need a PR to vllm to support all_to_all for GroupCoordinator. +# Future Plan: +# Remove this patch when vllm merged them. +# +# ** File: worker/patch_common/patch_utils.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.utils.direct_register_custom_op` +# Why: +# pytorch 2.7.o is not compatible with pytorch 2.5.1. While vllm is based on pytorch 2.7.0, but vllm ascend +# is based on pytorch 2.5.1, so we need to use this patch to make vllm compatible with pytorch 2.5.1. +# How: +# patch __annotations__ check to make it compatible with pytorch 2.5.1. +# Related PR (if no, explain why): +# This is the problem in vllm-ascend +# Future Plan: +# Remove this patch once pytorch 2.7.0 is supported for vllm ascend. 
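The registry above documents each override in prose; as a minimal illustration of the pattern (a sketch, not the actual vllm-ascend implementation), a patch module such as the `patch_utils` one rebinds the upstream symbol at import time. The wrapper below is hypothetical and simply delegates to the original `vllm.utils.direct_register_custom_op`:

```python
# Minimal sketch of the import-time monkey-patch pattern documented above.
# The wrapper here is hypothetical and only delegates; the real patch adjusts
# the __annotations__ handling before registering the custom op.
import vllm.utils

_original_direct_register_custom_op = vllm.utils.direct_register_custom_op


def _patched_direct_register_custom_op(*args, **kwargs):
    # Hypothetical hook point: reconcile the torch 2.5.1 / 2.7.0 annotation
    # differences here, then fall through to the upstream implementation.
    return _original_direct_register_custom_op(*args, **kwargs)


# Rebinding the attribute is enough; importing the patch module activates it.
vllm.utils.direct_register_custom_op = _patched_direct_register_custom_op
```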
From 9861dc5e6310c0236749f3c6a2765f3ec8e355b0 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Tue, 10 Jun 2025 22:26:53 +0800 Subject: [PATCH 06/28] [MLA][Graph] Improve assertion on Graph mode with MLA (#933) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Improve assertion on Graph mode with MLA. When running deepseek with graph mode, the fused MLA op only support `numHeads / numKvHeads ∈ {32, 64, 128}`, thus we improve the assertion info here to avoid users confused with this. ### Does this PR introduce _any_ user-facing change? Adjusting tp size is required when running deepseek-v3/r1 with graph mode. deepseek-v2-lite is not supported in graph mode. ### How was this patch tested? Test locally as the CI machine could not run V3 due to the HBM limits. --------- Signed-off-by: MengqingCao Signed-off-by: wangxiaoxin (A) --- docs/source/faqs.md | 10 ++++++++++ vllm_ascend/attention/attention.py | 11 +++++++++++ vllm_ascend/attention/mla_v1.py | 11 +++++++++++ vllm_ascend/worker/multi_step_worker.py | 2 +- 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/docs/source/faqs.md b/docs/source/faqs.md index 1d355b5e9..1de3befb2 100644 --- a/docs/source/faqs.md +++ b/docs/source/faqs.md @@ -113,3 +113,13 @@ In scenarios where NPUs have limited HBM (High Bandwidth Memory) capacity, dynam - **Adjust `--gpu-memory-utilization`**: If unspecified, will use the default value of `0.9`. You can decrease this param to reserve more memory to reduce fragmentation risks. See more note in: [vLLM - Inference and Serving - Engine Arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine.arg_utils-_engine_args_parser-cacheconfig). - **Configure `PYTORCH_NPU_ALLOC_CONF`**: Set this environment variable to optimize NPU memory management. For example, you can `export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` to enable virtual memory feature to mitigate memory fragmentation caused by frequent dynamic memory size adjustments during runtime, see more note in: [PYTORCH_NPU_ALLOC_CONF](https://www.hiascend.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html). + +### 15. Failed to enable NPU graph mode when running DeepSeek? +You may encounter the following error if running DeepSeek with NPU graph mode enabled. The allowed number of queries per kv when enabling both MLA and Graph mode only support {32, 64, 128}, **Thus this is not supported for DeepSeek-V2-Lite**, as it only has 16 attention heads. The NPU graph mode support on DeepSeek-V2-Lite will be done in the future. + +And if you're using DeepSeek-V3 or DeepSeek-R1, please make sure after the tensor parallel split, num_heads / num_kv_heads in {32, 64, 128}. + +```bash +[rank0]: RuntimeError: EZ9999: Inner Error! +[rank0]: EZ9999: [PID: 62938] 2025-05-27-06:52:12.455.807 numHeads / numKvHeads = 8, MLA only support {32, 64, 128}.[FUNC:CheckMlaAttrs][FILE:incre_flash_attention_tiling_check.cc][LINE:1218] +``` diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py index 8f130e424..a567cc530 100644 --- a/vllm_ascend/attention/attention.py +++ b/vllm_ascend/attention/attention.py @@ -40,6 +40,8 @@ from vllm_ascend.worker.model_runner import ( ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata) +_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128] + def generate_attn_mask(max_seq_len: int, dtype=torch.float16, mask_value=None): # Construct lower triangle matrix. 
@@ -1005,6 +1007,15 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + # TODO: support numHeads / numKvHeads < 16 in MLA kernel + if self.torchair_graph_enabled: + assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ + ("The allowed number of queries per kv when enabling both MLA and Graph mode" + " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," + " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1," + " please make sure after the tensor parallel split, num_heads / num_kv_heads in " + "{32, 64, 128}.") + def exec_kv( self, hidden_states: torch.Tensor, diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 70fb44df8..d9f20cc23 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -14,6 +14,7 @@ from vllm.utils import cdiv, round_down from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig from vllm_ascend.multistream.context import get_multistream_comm_context @@ -551,6 +552,7 @@ def __init__( self.o_proj = kwargs['o_proj'] self.kv_a_proj_with_mqa = kwargs.get('kv_a_proj_with_mqa', None) self.kv_a_layernorm = kwargs.get('kv_a_layernorm', None) + self.num_queries_per_kv = self.num_heads // self.num_kv_heads ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled @@ -560,6 +562,15 @@ def __init__( self.spec_token_num = speculative_config.num_speculative_tokens assert self.spec_token_num > 0 + # TODO: support numHeads / numKvHeads < 16 in MLA kernel + if self.torchair_graph_enabled: + assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ + ("The allowed number of queries per kv when enabling both MLA and Graph mode" + " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," + " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1," + " please make sure after the tensor parallel split, num_heads / num_kv_heads in " + "{32, 64, 128}.") + def _v_up_proj_and_o_proj(self, x): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) diff --git a/vllm_ascend/worker/multi_step_worker.py b/vllm_ascend/worker/multi_step_worker.py index ba83f6b96..6d092805d 100644 --- a/vllm_ascend/worker/multi_step_worker.py +++ b/vllm_ascend/worker/multi_step_worker.py @@ -119,7 +119,7 @@ def _prepare_last_sampled_token_ids_for_tp_workers( # execute_model_req assert execute_model_req.last_sampled_token_ids is not None model_input.last_sampled_token_ids = ( - execute_model_req.last_sampled_token_ids.cuda()) + execute_model_req.last_sampled_token_ids.npu()) model_input.add_sampler_output( SamplerOutput(outputs=[], sampled_token_ids=None), model_input.last_sampled_token_ids) From cf419aac7438b51753cfad375e9d0e73bfc836fb Mon Sep 17 00:00:00 2001 From: 22dimensions Date: Wed, 11 Jun 2025 06:18:32 +0800 Subject: [PATCH 07/28] [CI] rename Qwen2.5-0.5B-Instruct-W8A8 model (#1145) 1. 
rename vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new to vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8 Signed-off-by: 22dimensions Signed-off-by: wangxiaoxin (A) --- tests/singlecard/test_offline_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/singlecard/test_offline_inference.py b/tests/singlecard/test_offline_inference.py index 006883863..572649f9a 100644 --- a/tests/singlecard/test_offline_inference.py +++ b/tests/singlecard/test_offline_inference.py @@ -39,7 +39,7 @@ MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"] QUANTIZATION_MODELS = [ - "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new", + "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8", ] os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" From b97e79c943742acc2be2178388b07dcef3423f06 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Wed, 11 Jun 2025 07:31:13 +0800 Subject: [PATCH 08/28] [CI] Skip test_v1_spec_decode.py::test_ngram_correctness to make longterm CI pass (#1163) [CI] Skip test_v1_spec_decode.py::test_ngram_correctness to make longterm CI pass Related: https://github.com/vllm-project/vllm-ascend/issues/1162 Signed-off-by: MengqingCao Signed-off-by: wangxiaoxin (A) --- .github/workflows/vllm_ascend_test_long_term.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index 8afb95048..11144280c 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -94,7 +94,8 @@ jobs: if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then # spec decode test VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py - VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_spec_decode.py + # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed + # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_spec_decode.py VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process pytest -sv tests/long_term/spec_decode --ignore=tests/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py pytest -sv tests/long_term/test_accuracy.py From 87a74bbaa00be2e296728572658f2dd81842eaeb Mon Sep 17 00:00:00 2001 From: sdmyzlp <117554856+sdmyzlp@users.noreply.github.com> Date: Wed, 11 Jun 2025 09:18:38 +0800 Subject: [PATCH 09/28] Support multistream of shared experts in FusedMoE (#997) Contains on #1111 for completeness. ### What this PR does / why we need it? Implement multi-stream parallelism for MoE layers with shared experts, where computation of shared experts will be overlapped with expert token dispatch and combine. Also, when multi-stream is enabled, weights of shared experts will be force to replicate across all cards, regardless of any tensor parallelism configurations, to avoid AllReduce operations. With the expected overlaping being: ``` | shared gate_up | shared act | | shared down | | dispatch | routed gate_up, act, down | combine | ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Tested on 1x16 910 node, with tailored 2 layer DSKv2. 
--------- Signed-off-by: sdmyzlp Signed-off-by: wangxiaoxin (A) --- .github/workflows/vllm_ascend_test.yaml | 2 + docs/source/user_guide/additional_config.md | 4 +- mypy.ini | 3 + .../test_offline_inference_distributed.py | 19 +- tests/singlecard/test_ascend_config.py | 4 +- vllm_ascend/ascend_config.py | 4 +- vllm_ascend/models/deepseek_dbo.py | 110 +-------- vllm_ascend/models/deepseek_v2.py | 217 +++++++++++------- vllm_ascend/ops/fused_moe.py | 77 ++++--- vllm_ascend/quantization/w8a8_dynamic.py | 142 +++++------- vllm_ascend/utils.py | 22 +- 11 files changed, 296 insertions(+), 308 deletions(-) diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 81f1bbe8f..aaaa3b753 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -186,6 +186,7 @@ jobs: VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py fi @@ -216,5 +217,6 @@ jobs: VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py fi diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index 51b8e5431..76bac94d0 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -40,11 +40,11 @@ The details of each config option are as follows: | Name | Type | Default | Description | | ---- | ---- | ------- | ----------- | | `enabled` | bool | `False` | Whether to enable torchair graph mode | +| `enable_multistream_moe`| bool | `False` | Whether to enable multistream shared expert | | `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization | | `use_cached_graph` | bool | `False` | Whether to use cached graph | | `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache | | `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty | -| `enable_multistream_shared_expert`| bool | `False` | Whether to enable multistream shared expert | **ascend_scheduler_config** @@ -65,7 +65,7 @@ A full example of additional configuration is as follows: "use_cached_graph": true, "graph_batch_sizes": [1, 2, 4, 8], "graph_batch_sizes_init": false, - "enable_multistream_shared_expert": false + "enable_multistream_moe": false }, "ascend_scheduler_config": { "enabled": true, diff --git 
a/mypy.ini b/mypy.ini index 72b03de21..6fe8e6c29 100644 --- a/mypy.ini +++ b/mypy.ini @@ -6,6 +6,9 @@ warn_unused_configs = True [mypy-torch_npu.*] ignore_missing_imports = True +[mypy-torchair.*] +ignore_missing_imports = True + [mypy-transformers.*] ignore_missing_imports = True diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py index dc02c4b97..f5ec2c872 100644 --- a/tests/multicard/test_offline_inference_distributed.py +++ b/tests/multicard/test_offline_inference_distributed.py @@ -23,7 +23,7 @@ import os from unittest.mock import patch -import vllm # noqa: F401 +from modelscope import snapshot_download # type: ignore from vllm import SamplingParams from tests.conftest import VllmRunner @@ -95,3 +95,20 @@ def test_models_distributed_DeepSeek_dbo(): distributed_executor_backend="mp", ) as vllm_model: vllm_model.generate(example_prompts, sampling_params) + + +def test_models_distributed_DeepSeek_W8A8(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + + with VllmRunner( + snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"), + max_model_len=8192, + enforce_eager=True, + dtype="auto", + tensor_parallel_size=4, + quantization="ascend", + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/singlecard/test_ascend_config.py b/tests/singlecard/test_ascend_config.py index 484fe5f70..818745f30 100644 --- a/tests/singlecard/test_ascend_config.py +++ b/tests/singlecard/test_ascend_config.py @@ -58,7 +58,7 @@ def test_run_with_ascend_config(): "use_cached_graph": True, "graph_batch_sizes": [1, 2, 4, 8], "graph_batch_sizes_init": False, - "enable_multistream_shared_expert": True, + "enable_multistream_moe": True, }, "ascend_scheduler_config": { "enabled": True, @@ -79,7 +79,7 @@ def test_run_with_ascend_config(): 1, 2, 4, 8 ] assert not ascend_config.torchair_graph_config.graph_batch_sizes_init - assert ascend_config.torchair_graph_config.enable_multistream_shared_expert + assert ascend_config.torchair_graph_config.enable_multistream_moe assert ascend_config.ascend_scheduler_config.enabled assert ascend_config.ascend_scheduler_config.enable_chunked_prefill assert ascend_config.expert_tensor_parallel_size == 1 diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index d42f145e4..2f9e03dff 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -56,8 +56,8 @@ def __init__(self, torchair_graph_config): "graph_batch_sizes", []) self.graph_batch_sizes_init = torchair_graph_config.get( "graph_batch_sizes_init", False) - self.enable_multistream_shared_expert = torchair_graph_config.get( - "enable_multistream_shared_expert", False) + self.enable_multistream_moe = torchair_graph_config.get( + "enable_multistream_moe", False) self.enable_view_optimize = torchair_graph_config.get( "enable_view_optimize", True) diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py index c3de6ae93..9db49cbff 100644 --- a/vllm_ascend/models/deepseek_dbo.py +++ b/vllm_ascend/models/deepseek_dbo.py @@ -29,7 +29,7 @@ import torch import torch.distributed as dist -import torch_npu +import torch_npu # noqa: F401 import vllm.envs as envs from torch import nn from transformers import PretrainedConfig @@ -40,13 +40,10 @@ get_tp_group, tensor_model_parallel_all_reduce) from vllm.distributed.parallel_state import get_dp_group from vllm.forward_context import get_forward_context -from vllm.model_executor.layers.activation import 
SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, ReplicatedLinear, - RowParallelLinear, - UnquantizedLinearMethod) + RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope @@ -67,6 +64,7 @@ import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2MLP from vllm_ascend.multistream.base import MSEventKey from vllm_ascend.multistream.context import ( advance_step_multistream_layer_context, get_multistream_comm_context, @@ -78,117 +76,17 @@ make_multistream_metadata_ds) from vllm_ascend.multistream.ms_split import compute_split_seq_index from vllm_ascend.ops.fused_moe import AscendFusedMoE -from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod from vllm_ascend.utils import dispose_tensor VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 -class CustomDeepseekDBOMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - quant_config: Optional[QuantizationConfig] = None, - reduce_results: bool = True, - prefix: str = "", - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj") - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - # NOTE: `torch_npu.npu_dequant_swiglu_quant` can only be enabled in dynamic quant - self.is_dynamic_quant = not isinstance( - self.gate_up_proj.quant_method, - UnquantizedLinearMethod) and isinstance( - self.gate_up_proj.quant_method.quant_method, - AscendW8A8DynamicLinearMethod) - - def forward(self, x): - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - x = tensor_model_parallel_all_reduce(x) - return x - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x +class CustomDeepseekDBOMLP(CustomDeepseekV2MLP): def _forward_ms_mlp(self, x): current_ms_metadata = get_multistream_comm_context() assert current_ms_metadata is not None - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - current_ms_metadata.before_comm_event.record() - with torch.npu.stream(current_ms_metadata.comm_stream): - current_ms_metadata.before_comm_event.wait() - x = tensor_model_parallel_all_reduce(x) - current_ms_metadata.after_comm_event.record() - return x gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) current_ms_metadata.before_comm_event.record() diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 96c76338b..a83ca4751 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -25,7 +25,7 @@ # # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py # """Inference-only DeepseekV2/DeepseekV3 model.""" -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -69,12 +69,73 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import AscendFusedMoE +from vllm_ascend.quantization.quant_config import AscendLinearMethod from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod from vllm_ascend.utils import dispose_tensor VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 +class CustomDeepseekV2SiluAndMul(SiluAndMul): + + def __init__(self, + *, + weight_scale: Optional[Callable[[], torch.Tensor]] = None): + 
super().__init__() + self.weight_scale = weight_scale + + def forward_oot(self, x: Union[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]): + if isinstance(x, tuple): + assert self.weight_scale is not None + # For AscendW8A8DynamicLinearMethod: + # a dynamic scale is passed along with the quantized value. + quantized_x, dynamic_scale = x + return torch_npu.npu_dequant_swiglu_quant( + x=quantized_x, + weight_scale=self.weight_scale(), + activation_scale=dynamic_scale, + activate_left=True, + quant_mode=1) + else: + return super().forward_oot(x) + + +class CustomDeepseekV2MergedReplicatedLinear(ReplicatedLinear): + + def __init__( + self, + input_size: int, + output_sizes: list[int], + bias: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + self.output_sizes = output_sizes + super().__init__(input_size, + sum(output_sizes), + bias=bias, + quant_config=quant_config, + prefix=prefix) + + def weight_loader(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, loaded_shard_id: int): + # With no support for GGUF format yet. + assert not getattr(param, "is_gguf_weight", False) + assert not getattr(param, "is_gguf_weight_type", False) + + assert loaded_shard_id < len(self.output_sizes) + shard_offset = sum(self.output_sizes[:loaded_shard_id]) + shard_size = self.output_sizes[loaded_shard_id] + shard = param.data.narrow(param.output_dim, shard_offset, shard_size) + + assert shard.size() == loaded_weight.size(), ( + f"Tried to load weights of size {loaded_weight.size()}" + f"to a parameter shard of id {loaded_shard_id} size {shard.size()}" + ) + shard.copy_(loaded_weight) + + class CustomDeepseekV2MLP(nn.Module): def __init__( @@ -84,61 +145,68 @@ def __init__( hidden_act: str, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True, + force_replicate: bool = False, prefix: str = "", ) -> None: super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj") + if not force_replicate: + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") + else: + self.gate_up_proj = CustomDeepseekV2MergedReplicatedLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = ReplicatedLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") - self.act_fn = SiluAndMul() - # NOTE: `torch_npu.npu_dequant_swiglu_quant` can only be enabled in dynamic quant - self.is_dynamic_quant = not isinstance( - self.gate_up_proj.quant_method, - UnquantizedLinearMethod) and isinstance( - self.gate_up_proj.quant_method.quant_method, - AscendW8A8DynamicLinearMethod) + quant_method = self.gate_up_proj.quant_method + if isinstance(quant_method, UnquantizedLinearMethod): + self.act_fn = CustomDeepseekV2SiluAndMul() + elif (isinstance(quant_method, AscendLinearMethod) and isinstance( + quant_method.quant_method, AscendW8A8DynamicLinearMethod)): + # TODO(sdmyzlp): Currently preserved as before: + # 1. The only quantization supported for silu is W8A8Dynamic + # 2. Output dtype of gate_up/down is fixed to be int32/bfloat16 + # + # Maybe one can implement a better and more general configuration + # scheme, e.g. by somehow passing around the tweaked `quant_config` + self.act_fn = CustomDeepseekV2SiluAndMul( + # Use lazy binding, for `weight_scale_fp32` is accessible + # only after `process_weights_after_loading`. + weight_scale=lambda: self.gate_up_proj.weight_scale_fp32) + # To be consumed by AscendW8A8DynamicLinearMethod.apply() + self.gate_up_proj._ascend_quant_config = { + "output_dtype": torch.int32, + "pertoken_scale": False, + "return_scale": True, + } + self.down_proj._ascend_quant_config = { + "output_dtype": torch.bfloat16, + "pertoken_scale": True, + "return_scale": False, + } + else: + raise NotImplementedError( + f"Quantization with [{type(quant_method)}] is NOT supported") def forward(self, x): - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - x = tensor_model_parallel_all_reduce(x) - return x gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) x, _ = self.down_proj(x) @@ -169,6 +237,12 @@ def __init__( raise ValueError(f"Unsupported activation: {config.hidden_act}. 
" "Only silu is supported for now.") + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on + self.enable_multistream_moe = \ + ascend_config.torchair_graph_config.enable_multistream_moe and VLLM_ENABLE_MC2 + self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, bias=False, @@ -204,8 +278,11 @@ def __init__( hidden_act=config.hidden_act, quant_config=quant_config, reduce_results=True, + force_replicate=self.enable_multistream_moe, prefix=f"{prefix}.shared_experts", ) + else: + self.shared_experts = None # type: ignore CustomDeepseekV2MoE.top_k = config.num_experts_per_tok self.dp_size = get_dp_group().world_size @@ -216,12 +293,6 @@ def __init__( self.params_dtype = torch.get_default_dtype() - ascend_config = get_ascend_config() - self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on - self.enable_multistream_shared_expert = \ - ascend_config.torchair_graph_config.enable_multistream_shared_expert and VLLM_ENABLE_MC2 - def forward( self, hidden_states: torch.Tensor, @@ -240,12 +311,10 @@ def forward( enable_force_load_balance = False if hasattr(attn_metadata, 'with_prefill_across_dp'): is_prefill = is_prefill or attn_metadata.with_prefill_across_dp - num_tokens, hidden_size = hidden_states.shape - - multistream = self.enable_multistream_shared_expert and not is_prefill - - old_hidden_states = hidden_states.clone() + old_hidden_states = hidden_states + use_separated_shared_experts = (self.shared_experts is not None + and not self.enable_multistream_moe) if self.tp_size > 1: if (VLLM_ENABLE_MC2 @@ -262,25 +331,22 @@ def forward( # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - kwargs = {} - if multistream: - kwargs.update({ - "shared_experts": self.shared_experts, - "shared_hidden_states": old_hidden_states - }) - - hidden_states = self.experts( + experts_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits, is_prefill=is_prefill, top_k=CustomDeepseekV2MoE.top_k, enable_force_load_balance=enable_force_load_balance, - **kwargs) - - if multistream: - hidden_states, shared_output = hidden_states + shared_experts=(self.shared_experts + if not use_separated_shared_experts else None), + ) - hidden_states = hidden_states * self.routed_scaling_factor + if not isinstance(experts_hidden_states, tuple): + hidden_states = experts_hidden_states * self.routed_scaling_factor + else: + hidden_states = ( + experts_hidden_states[0] * self.routed_scaling_factor + + experts_hidden_states[1]) if self.tp_size > 1: if (VLLM_ENABLE_MC2 @@ -294,12 +360,9 @@ def forward( else: hidden_states = tensor_model_parallel_all_reduce(hidden_states) - if self.n_shared_experts is not None: - if not multistream: - shared_output = self.shared_experts(old_hidden_states) - - if shared_output is not None: - hidden_states = hidden_states + shared_output + if use_separated_shared_experts: + hidden_states = hidden_states + self.shared_experts( + old_hidden_states) return hidden_states.view(num_tokens, hidden_size) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 25f3b05d5..d6115d35c 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -16,7 +16,7 @@ # Adapted from vllm/tests/kernels/test_moe.py import os -from typing import Callable, List, Optional +from typing import Any, Callable, 
List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -36,6 +36,7 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer +from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 USING_LCCL_COM: bool = envs_ascend.USING_LCCL_COM @@ -106,15 +107,17 @@ def process_topk_ids(topk_ids: torch.Tensor, expert_num: int, ep_size: int, return topk_ids_pad, unpad_indices -def fused_experts_with_mc2(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - expert_map: torch.Tensor = None, - moe_all_to_all_group_name: Optional[str] = None, - **kwargs) -> torch.Tensor: +def fused_experts_with_mc2( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + moe_all_to_all_group_name: Optional[str] = None, + shared_experts: Optional[Any] = None +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: global_bs = 0 moe_expert_num = len(expert_map) kwargs_mc2 = { @@ -154,6 +157,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[ 0:5] + if shared_experts is not None: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(hidden_states, topk_weights) + shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states) + npu_wait_tensor(shared_gate_up, expand_x) + shared_act = shared_experts.act_fn(shared_gate_up) + w1 = w1.transpose(1, 2) group_list = expert_token_nums.to(torch.int64) @@ -210,7 +220,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2) - return hidden_states + if shared_experts is None: + return hidden_states + else: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(shared_act, down_out_list) + shared_hidden_states, _ = shared_experts.down_proj(shared_act) + return hidden_states, shared_hidden_states def apply_mlp(hidden_states_wrapper: List[torch.Tensor], @@ -875,6 +891,7 @@ def apply( e_score_correction_bias: Optional[torch.Tensor] = None, is_prefill: bool = False, enable_force_load_balance: bool = False, + shared_experts: Optional[Any] = None, **kwargs, ) -> torch.Tensor: @@ -924,7 +941,7 @@ def apply( top_k=top_k, expert_map=expert_map, moe_all_to_all_group_name=self.moe_all_to_all_group_name, - **kwargs) + shared_experts=shared_experts) elif self.torchair_graph_enabled or get_ep_group().world_size == 1: return fused_experts(hidden_states=x, w1=layer.w13_weight, @@ -1053,9 +1070,6 @@ def __init__( self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on - self.enable_multistream_shared_expert = \ - ascend_config.torchair_graph_config.enable_multistream_shared_expert and VLLM_ENABLE_MC2 if self.scoring_func != "softmax" and not self.use_grouped_topk: raise ValueError("Only softmax scoring function is supported for " @@ -1102,8 +1116,8 @@ def forward(self, router_logits: torch.Tensor, is_prefill: bool, enable_force_load_balance: bool = False, - top_k=None, - **kwargs): + top_k: Optional[int] = None, + shared_experts: Optional[Any] = 
None): assert self.quant_method is not None if top_k: @@ -1132,7 +1146,7 @@ def forward(self, hidden_states, router_logits) # Matrix multiply. - hidden_states = self.quant_method.apply( + e_hidden_states = self.quant_method.apply( layer=self, x=hidden_states, router_logits=router_logits, @@ -1150,36 +1164,39 @@ def forward(self, enable_force_load_balance=enable_force_load_balance, log2phy=self.log2phy, global_redundant_expert_num=self.global_redundant_expert_num, - **kwargs) + shared_experts=shared_experts, + ) - if self.enable_multistream_shared_expert and not is_prefill: - hidden_states, shared_output = hidden_states + if shared_experts is not None: + # Provide dummy implementation of "non-separated" shared experts. + if not isinstance(e_hidden_states, tuple): + return e_hidden_states, shared_experts(hidden_states) + else: + return e_hidden_states if self.dp_size > 1: if VLLM_ENABLE_MC2 and not is_prefill: ... elif self.torchair_graph_enabled: if USING_LCCL_COM: # type: ignore - hidden_states = dist._functional_collectives.reduce_scatter_tensor( - hidden_states, + e_hidden_states = dist._functional_collectives.reduce_scatter_tensor( + e_hidden_states, "sum", scatter_dim=0, group=get_dp_group().device_group) elif self.torchair_graph_enabled and not is_prefill: - hidden_states = dist._functional_collectives.reduce_scatter_tensor( - hidden_states, + e_hidden_states = dist._functional_collectives.reduce_scatter_tensor( + e_hidden_states, "sum", scatter_dim=0, group=get_dp_group().device_group) else: - hidden_states = get_ep_group().combine(hidden_states) + e_hidden_states = get_ep_group().combine(e_hidden_states) if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): - hidden_states = tensor_model_parallel_all_reduce(hidden_states) + e_hidden_states = tensor_model_parallel_all_reduce(e_hidden_states) - if self.enable_multistream_shared_expert and not is_prefill: - return hidden_states, shared_output - return hidden_states + return e_hidden_states # ----------------------------------------- TBO-related -------------------------------------------- diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index c6e863ff7..66a0a302c 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -15,19 +15,19 @@ # limitations under the License. 
# -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, Optional, Tuple, Union import torch import torch.distributed as dist import torch_npu -import torchair as tng # type: ignore -from vllm.distributed import GroupCoordinator, tensor_model_parallel_all_reduce +from vllm.distributed import GroupCoordinator import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import select_experts -from vllm_ascend.utils import dispose_tensor +from vllm_ascend.utils import (dispose_tensor, npu_stream_switch, + npu_wait_tensor) VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 @@ -39,8 +39,7 @@ def apply_mlp(hidden_states: torch.Tensor, w2_scale: torch.Tensor, group_list: torch.Tensor, dynamic_scale: torch.Tensor = None, - group_list_type: int = 1, - **kwargs) -> torch.Tensor: + group_list_type: int = 1) -> torch.Tensor: """ apply MLP: gate_up_proj -> swiglu -> down_proj @@ -74,23 +73,6 @@ def apply_mlp(hidden_states: torch.Tensor, else: pertoken_scale = dynamic_scale - shared_experts = kwargs.get('shared_experts', None) - if shared_experts: - shared_gate_up = kwargs.get('shared_gate_up', None) - shared_dynamic_scale = kwargs.get('shared_dynamic_scale', None) - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_gate_up, hidden_states) - shared_x, shared_dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=shared_gate_up, - weight_scale=shared_experts.gate_up_proj.weight_scale_fp32, - activation_scale=shared_dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - # gmm1: gate_up_proj hidden_states = torch_npu.npu_grouped_matmul( x=[hidden_states], @@ -120,36 +102,24 @@ def apply_mlp(hidden_states: torch.Tensor, group_list=group_list, output_dtype=w2_scale.dtype)[0] - if shared_experts: - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_x, hidden_states) - shared_output = torch_npu.npu_quant_matmul( - shared_x, - shared_experts.down_proj.weight, - shared_experts.down_proj.weight_scale, - pertoken_scale=shared_dynamic_scale, - output_dtype=torch.bfloat16, - ) - if shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1: - shared_output = tensor_model_parallel_all_reduce(shared_output) - if shared_experts: - return hidden_states, shared_output return hidden_states -def fused_experts_with_mc2(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - expert_map: torch.Tensor = None, - moe_all_to_all_group_name: str = "", - log2phy: torch.Tensor = None, - global_redundant_expert_num: int = 0, - **kwargs) -> torch.Tensor: +def fused_experts_with_mc2( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + moe_all_to_all_group_name: str = "", + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[Any] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if log2phy: topk_ids = log2phy[topk_ids] global_bs = 0 @@ -188,31 +158,17 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, } kwargs_mc2.update(stage1_kwargs) - 
shared_experts = kwargs.get('shared_experts', None) - if shared_experts: - shared_hidden_states = kwargs.get('shared_hidden_states', None) - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_hidden_states, hidden_states) - shared_x, shared_dynamic_scale = torch_npu.npu_dynamic_quant( - shared_hidden_states) - shared_gate_up = torch_npu.npu_quant_matmul( - shared_x, - shared_experts.gate_up_proj.weight, - shared_experts.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - kwargs.update({ - "shared_gate_up": shared_gate_up, - "shared_dynamic_scale": shared_dynamic_scale, - }) - output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2) # comm_stream.wait_stream(torch.npu.current_stream()) expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[ 0:5] - if quant_mode == 0: - dynamic_scale = None + if shared_experts is not None: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(hidden_states, topk_weights) + shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states) + npu_wait_tensor(shared_gate_up[0], expand_x) + shared_act = shared_experts.act_fn(shared_gate_up) # `expand_x` will be disposed in the `apply_mlp` function down_out_list = apply_mlp(expand_x, @@ -221,12 +177,7 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, w2, w2_scale, expert_token_nums, - dynamic_scale=dynamic_scale, - **kwargs) - - multi_stream = isinstance(down_out_list, tuple) - if multi_stream: - down_out_list, shared_output = down_out_list + dynamic_scale=dynamic_scale) # moeCombine kwargs_mc2 = { @@ -257,9 +208,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2) - if multi_stream: + if shared_experts is None: + return hidden_states + else: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(shared_act[0], down_out_list) + shared_output, _ = shared_experts.down_proj(shared_act) return hidden_states, shared_output - return hidden_states # currently expert parallelism implemented with all2all @@ -541,21 +496,33 @@ def get_perchannel_param( @staticmethod def apply( layer: torch.nn.Module, - x: torch.Tensor, + x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], bias: Optional[torch.Tensor] = None, tp_rank: Optional[int] = 0, ) -> torch.Tensor: - original_dtype = x.dtype - # use ATB quantize - quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x) - return torch_npu.npu_quant_matmul( - quant_out, + config = getattr(layer, "_ascend_quant_config", {}) + if not isinstance(x, tuple): + output_dtype = config.get("output_dtype", x.dtype) + quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x) + else: + assert "output_dtype" in config.keys(), ( + f"DynamicLinearMethod needs explicitly specified `output_dtype`" + f"for pre-quantized input, got config [{config}]") + output_dtype = config["output_dtype"] + quantized_x, dynamic_scale = x + pertoken_scale = (dynamic_scale + if config.get("pertoken_scale", True) else None) + + output = torch_npu.npu_quant_matmul( + quantized_x, layer.weight, layer.weight_scale, - pertoken_scale=dynamic_scale, + pertoken_scale=pertoken_scale, bias=bias, - output_dtype=original_dtype, + output_dtype=output_dtype, ) + return ((output, dynamic_scale) + if config.get("return_scale", False) else output) def process_weights_after_loading(self, layer): if self.transpose_weight: @@ -650,6 +617,7 @@ def apply( enable_force_load_balance: bool = True, log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, + 
shared_experts: Optional[Any] = None, **kwargs, ) -> torch.Tensor: assert router_logits.shape[ @@ -706,7 +674,7 @@ def apply( moe_all_to_all_group_name=self.moe_all_to_all_group_name, log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, - **kwargs) + shared_experts=shared_experts) elif self.torchair_graph_enabled or self.ep_group.world_size == 1: return fused_experts(hidden_states=x, w1=layer.w13_weight, diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 7d4093804..f41dab4b9 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -19,17 +19,26 @@ import atexit import math -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext from threading import Lock from typing import TYPE_CHECKING, List, Tuple import torch +import torchair # type: ignore[import] # noqa: F401 from packaging.version import InvalidVersion, Version from torch_npu.npu.streams import Event from vllm.logger import logger import vllm_ascend.envs as envs +try: + # Recent release of torchair has moved these ops to `.scope`. + from torchair.scope import npu_stream_switch as _npu_stream_switch + from torchair.scope import npu_wait_tensor as _npu_wait_tensor +except ImportError: + from torchair.ops import NpuStreamSwitch as _npu_stream_switch + from torchair.ops import npu_wait_tensor as _npu_wait_tensor + if TYPE_CHECKING: from vllm.config import VllmConfig else: @@ -227,3 +236,14 @@ def pop_captured_sync(self) -> dict: durations[tag] = observe_start.elapsed_time(observe_end) return durations + + +def npu_stream_switch(tag: str, priority: int, *, enabled: bool = True): + return _npu_stream_switch(tag, priority) if enabled else nullcontext() + + +def npu_wait_tensor(self: torch.Tensor, + dependency: torch.Tensor, + *, + enabled: bool = True): + return _npu_wait_tensor(self, dependency) if enabled else self From 5f896524051b2fd5bec038d1e6e4ad0e4e9fc36c Mon Sep 17 00:00:00 2001 From: depeng1994 <166494784+depeng1994@users.noreply.github.com> Date: Wed, 11 Jun 2025 10:02:11 +0800 Subject: [PATCH 10/28] provide an e2e guide for execute duration profiling (#1113) ### What this PR does / why we need it? provide an e2e guide for execute duration profiling Signed-off-by: depeng1994 Signed-off-by: wangxiaoxin (A) --- .../developer_guide/evaluation/profile_execute_duration.md | 5 +++++ vllm_ascend/worker/model_runner_v1.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/source/developer_guide/evaluation/profile_execute_duration.md b/docs/source/developer_guide/evaluation/profile_execute_duration.md index 8989bf9cf..3b37df6be 100644 --- a/docs/source/developer_guide/evaluation/profile_execute_duration.md +++ b/docs/source/developer_guide/evaluation/profile_execute_duration.md @@ -9,6 +9,11 @@ The execution duration of each stage (including pre/post-processing, model forwa * Use the non-blocking API `ProfileExecuteDuration().capture_async` to set observation points asynchronously when you need to observe the execution duration. * Use the blocking API `ProfileExecuteDuration().pop_captured_sync` at an appropriate time to get and print the execution durations of all observed stages. +**We have instrumented the key inference stages (including pre-processing, model forward pass, etc.) for execute duration profiling. 
Execute the script as follows:** +``` +VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE=1 python3 vllm-ascend/examples/offline_inference_npu.py +``` + ## Example Output ``` diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 18d9f3ac5..68f476f4c 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1066,8 +1066,8 @@ def execute_model( for tag, duration in durations.items() ] captured_name = "Decode" if self.attn_state == AscendAttentionState.DecodeOnly else "Prefill" - print(f"Profile execute duration [{captured_name}]:", - " ".join(dr_str)) + logger.info("Profile execute duration [%s]:%s", captured_name, + " ".join(dr_str)) return model_runner_output From 83263144edbabaa2143592193b649181552701e3 Mon Sep 17 00:00:00 2001 From: ttanzhiqiang <38750855+ttanzhiqiang@users.noreply.github.com> Date: Wed, 11 Jun 2025 10:40:50 +0800 Subject: [PATCH 11/28] etp best a2 (#1101) ### What this PR does / why we need it? Single machine 16 cards deepseekr1 attention (tp8/dp2) / moe(etp) Best performance rely on: vllm-ascend commit id:da9acfca6053352730fce75fb772e214755d0341 vllm commit id:b124e1085b1bf977e3dac96d99ffd9d8ddfdb6cc + https://github.com/vllm-project/vllm-ascend/pull/910 + [Reduce _npu_flash_attention mask to 128x128 for memory savings] https://github.com/vllm-project/vllm-ascend/pull/1100+ [Reduce memory usage by splitting tokens in fused_experts] --------- Signed-off-by: ttanzhiqiang <389825161@qq.com> Signed-off-by: wangxiaoxin (A) --- examples/run_dp_attention_etp16.sh | 23 +++++++++ examples/run_dp_attention_etp16_benmark.sh | 56 ++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 examples/run_dp_attention_etp16.sh create mode 100644 examples/run_dp_attention_etp16_benmark.sh diff --git a/examples/run_dp_attention_etp16.sh b/examples/run_dp_attention_etp16.sh new file mode 100644 index 000000000..b73649241 --- /dev/null +++ b/examples/run_dp_attention_etp16.sh @@ -0,0 +1,23 @@ +export VLLM_ENABLE_MC2=0 +export VLLM_USE_V1=1 +export TASK_QUEUE_ENABLE=1 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +source /usr/local/Ascend/nnal/atb/set_env.sh +export ASCEND_LAUNCH_BLOCKING=0 +export VLLM_VERSION=0.9.0 + +nohup python -m vllm.entrypoints.openai.api_server --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM \ + --quantization ascend \ + --trust-remote-code \ + --distributed-executor-backend=mp \ + --port 8006 \ + -tp=8 \ + -dp=2 \ + --max-num-seqs 24 \ + --max-model-len 32768 \ + --max-num-batched-tokens 32768 \ + --block-size 128 \ + --no-enable-prefix-caching \ + --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true},"expert_tensor_parallel_size":16}' \ + --gpu-memory-utilization 0.96 &> run.log & +disown \ No newline at end of file diff --git a/examples/run_dp_attention_etp16_benmark.sh b/examples/run_dp_attention_etp16_benmark.sh new file mode 100644 index 000000000..ab72b3b28 --- /dev/null +++ b/examples/run_dp_attention_etp16_benmark.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Concurrency array +concurrency_array=(48) +#best rate +rate_array=(0.7) + +# Result file +result_file="benchmark_results.txt" +echo "Benchmark Results" > $result_file +echo "===================" >> $result_file + +# Loop through all combinations +for concurrency in "${concurrency_array[@]}"; do + for rate in "${rate_array[@]}"; do + echo "Testing with concurrency=$concurrency, rate=$rate" + echo "" >> $result_file + echo 
"Concurrency: $concurrency, Request Rate: $rate" >> $result_file + echo "-------------------" >> $result_file + + # Run benchmark test + python /mnt/deepseek/vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --trust-remote-code \ + --model /mnt/deepseek/DeepSeek-R1-W8A8-VLLM \ + --dataset-name random \ + --random-input-len 4096 \ + --random-output-len 1536 \ + --ignore-eos \ + --num-prompts 400 \ + --max-concurrency $concurrency \ + --request-rate $rate \ + --metric-percentiles 90 \ + --base-url http://localhost:8006 2>&1 | tee -a $result_file + + # Wait for system cool down + sleep 30 + done +done + +# Analyze results +echo "Analysis Results" > analysis_results.txt +echo "=================" >> analysis_results.txt + +# Extract and analyze TPOT data +echo "TPOT Analysis:" >> analysis_results.txt +grep "Mean TPOT" $result_file | awk -F':' '{ + printf "Concurrency %s, Rate %s: %s ms\n", $1, $2, $3 +}' >> analysis_results.txt + +# Extract and analyze throughput data +echo -e "\nThroughput Analysis:" >> analysis_results.txt +grep "Output token throughput" $result_file | awk -F':' '{ + printf "Concurrency %s, Rate %s: %s tokens/s\n", $1, $2, $3 +}' >> analysis_results.txt + +echo "Testing completed. Results saved in $result_file and analysis in analysis_results.txt" From 933e26157393c3049364fbbb7c68b2a871c2d2b2 Mon Sep 17 00:00:00 2001 From: yz <43207690+yzim@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:03:37 +0800 Subject: [PATCH 12/28] [Doc] Fix the config parameter name "enable" in graph_mode.md. (#1159) Fix the doc typo in graph_mode.md Signed-off-by: yzim <43207690+yzim@users.noreply.github.com> Signed-off-by: wangxiaoxin (A) --- docs/source/user_guide/graph_mode.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md index 126978f95..6831809f2 100644 --- a/docs/source/user_guide/graph_mode.md +++ b/docs/source/user_guide/graph_mode.md @@ -47,14 +47,14 @@ from vllm import LLM os.environ["VLLM_USE_V1"] = 1 -model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enable": True}}) +model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True}}) outputs = model.generate("Hello, how are you?") ``` online example: ```shell -vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enable": true}}' +vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enabled": true}}' ``` You can find more detail about additional config [here](./additional_config.md) From 6ba3c107b0efa22ea187866eb9d6dbb6745ac0d5 Mon Sep 17 00:00:00 2001 From: chenwaner <48718746+chenwaner@users.noreply.github.com> Date: Wed, 11 Jun 2025 14:09:28 +0800 Subject: [PATCH 13/28] Enable kvcache_nz for the decode process in torchair graph mode (#1098) What this PR does / why we need it? Enable kvcache_nz for the decode process in torchair graph mode, which reduces the time consumed by FA in long sequences. Does this PR introduce any user-facing change? If need to enable kvcache_nz, should set the additional_config.torchair_graph_config.enable_kv_nz=True How was this patch tested? 1. Tested in deepseek model: with batchsize 64 and seq_len 1k+3k, 61 layers FA total time improves 20.80ms -> 19.76ms 2. operator precision test: [aclnnFusedInferAttentionScoreV3_result.csv](https://github.com/user-attachments/files/20664138/aclnnFusedInferAttentionScoreV3_result.csv) 3. 
tpot test from @ttanzhiqiang, and curl one result is normal https://github.com/vllm-project/vllm-ascend/pull/1098#issuecomment-2948542159 https://github.com/vllm-project/vllm-ascend/pull/1098#issuecomment-2954496588 --------- Signed-off-by: chenwaner <861645847@qq.com> Signed-off-by: wangxiaoxin (A) --- docs/source/user_guide/additional_config.md | 4 +- vllm_ascend/ascend_config.py | 1 + vllm_ascend/attention/mla_v1.py | 138 +++++++++++++------- 3 files changed, 96 insertions(+), 47 deletions(-) diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index 76bac94d0..b1041bf17 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -45,6 +45,7 @@ The details of each config option are as follows: | `use_cached_graph` | bool | `False` | Whether to use cached graph | | `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache | | `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty | +| `enable_kv_nz`| bool | `False` | Whether to enable kvcache NZ layout | **ascend_scheduler_config** @@ -65,7 +66,8 @@ A full example of additional configuration is as follows: "use_cached_graph": true, "graph_batch_sizes": [1, 2, 4, 8], "graph_batch_sizes_init": false, - "enable_multistream_moe": false + "enable_multistream_moe": false, + "enable_kv_nz": false }, "ascend_scheduler_config": { "enabled": true, diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 2f9e03dff..67c9842df 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -60,6 +60,7 @@ def __init__(self, torchair_graph_config): "enable_multistream_moe", False) self.enable_view_optimize = torchair_graph_config.get( "enable_view_optimize", True) + self.enable_kv_nz = torchair_graph_config.get("enable_kv_nz", False) if not isinstance(self.graph_batch_sizes, list): raise TypeError("graph_batch_sizes must be list[int]") diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index d9f20cc23..b17d65c04 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -556,6 +556,7 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz # Adapt torch air graph mode with spec decoding. 
speculative_config = get_current_vllm_config().speculative_config if speculative_config is not None: @@ -859,6 +860,7 @@ def exec_kv( kv = self.kv_a_proj_with_mqa(hidden_states)[0] # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) + cache_mode = "PA_NZ" if self.enable_kv_nz else "PA" k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( kv, self.kv_a_layernorm.weight, @@ -868,7 +870,37 @@ def exec_kv( kv_cache[1], kv_cache[0], epsilon=self.kv_a_layernorm.variance_epsilon, - cache_mode="PA", + cache_mode=cache_mode, + ) + return k_pe, k_nope + + def exec_kv_prefill( + self, + hidden_states: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + kv_cache: Tuple, + slots: torch.Tensor, + ): + + B = hidden_states.shape[0] + N = self.num_kv_heads + S = 1 + kv = self.kv_a_proj_with_mqa(hidden_states)[0] + # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] + kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) + cache_mode = "PA_BLK_NZ" if self.enable_kv_nz else "PA" + _, _, k_pe, k_nope = torch_npu.npu_kv_rmsnorm_rope_cache( + kv, + self.kv_a_layernorm.weight, + cos, + sin, + slots.to(torch.int64), + kv_cache[1], + kv_cache[0], + epsilon=self.kv_a_layernorm.variance_epsilon, + cache_mode=cache_mode, + is_output_kv=True, ) return k_pe, k_nope @@ -906,34 +938,42 @@ def _forward_decode( # TorchAir's shape is [bs, num_heads_per_rank, q_seq_len, dim] if attn_metadata.attn_state == AscendAttentionState.SpecDecoding: assert num_tokens % self.spec_token_num == 0 - q_nope = (q_nope.view( - num_tokens // (self.spec_token_num + 1), - self.spec_token_num + 1, - self.num_heads, - -1, - ).transpose(1, 2).contiguous()) - q_pe = (q_pe.view( - num_tokens // (self.spec_token_num + 1), - self.spec_token_num + 1, - self.num_heads, - -1, - ).transpose(1, 2).contiguous()) + q_nope = q_nope.view(num_tokens // (self.spec_token_num + 1), + self.spec_token_num + 1, self.num_heads, + -1) + q_pe = q_pe.view(num_tokens // (self.spec_token_num + 1), + self.spec_token_num + 1, self.num_heads, -1) + if not self.enable_kv_nz: + q_nope = q_nope.transpose(1, 2).contiguous() + q_pe = q_pe.transpose(1, 2).contiguous() sparse_mode = 3 spec_attn_mask = attn_metadata.decode.attn_mask # type:ignore else: - q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) - q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) + if self.enable_kv_nz: + q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1) + q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1) + else: + q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) + q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) sparse_mode = 0 spec_attn_mask = None # shape of knope/k_pe for npu graph mode should be: # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim] block_size = kv_c_and_k_pe_cache[0].shape[1] - k_nope = k_nope.view(-1, self.num_kv_heads, block_size, - self.kv_lora_rank) - k_pe = k_pe.view(-1, self.num_kv_heads, block_size, - self.qk_rope_head_dim) + if self.enable_kv_nz: + k_nope = k_nope.view(-1, self.num_kv_heads, + self.kv_lora_rank // 16, block_size, 16) + k_pe = k_pe.view(-1, self.num_kv_heads, + self.qk_rope_head_dim // 16, block_size, 16) + input_layout = "BSND" + else: + k_nope = k_nope.view(-1, self.num_kv_heads, block_size, + self.kv_lora_rank) + k_pe = k_pe.view(-1, self.num_kv_heads, block_size, + self.qk_rope_head_dim) + input_layout = "BNSD" - attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score( + attn_output, _ = 
torch_npu.npu_fused_infer_attention_score( q_nope, k_nope, k_nope, @@ -941,7 +981,7 @@ def _forward_decode( key_rope=k_pe, num_heads=self.num_heads, num_key_value_heads=self.num_kv_heads, - input_layout="BNSD", + input_layout=input_layout, atten_mask=spec_attn_mask, sparse_mode=sparse_mode, scale=self.scale, @@ -990,10 +1030,11 @@ def forward( ] num_actual_toks = attn_metadata.num_actual_tokens if k_pe is None and not self.running_in_graph: - kv_c, k_pe = self.kv_a_proj_with_mqa( - hidden_states_or_kv_c_normed)[0].split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) + if not self.torchair_graph_enabled: + kv_c, k_pe = self.kv_a_proj_with_mqa( + hidden_states_or_kv_c_normed)[0].split( + [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) else: kv_c_normed = hidden_states_or_kv_c_normed assert attn_metadata.num_decodes is not None and \ @@ -1006,16 +1047,18 @@ def forward( # Inputs and outputs may be padded for CUDA graphs output_padded = output output = output[:num_actual_toks, ...] - kv_c_normed = kv_c_normed[:num_actual_toks, ...] - prefill_k_c_normed = kv_c_normed[num_decode_tokens:] + if not self.torchair_graph_enabled: + kv_c_normed = kv_c_normed[:num_actual_toks, ...] + prefill_k_c_normed = kv_c_normed[num_decode_tokens:] if not self.running_in_graph: hidden_states_or_q_c = hidden_states_or_q_c[:num_actual_toks, ...] - decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] prefill_hs_or_q_c = hidden_states_or_q_c[num_decode_tokens:] - k_pe = k_pe[:num_actual_toks, ...] - k_pe = k_pe.unsqueeze(1) - decode_k_pe = k_pe[:num_decode_tokens] - prefill_k_pe = k_pe[num_decode_tokens:] + if not self.torchair_graph_enabled: + decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] + k_pe = k_pe[:num_actual_toks, ...] + k_pe = k_pe.unsqueeze(1) + decode_k_pe = k_pe[:num_decode_tokens] + prefill_k_pe = k_pe[num_decode_tokens:] else: decode_hs_or_q_c = hidden_states_or_q_c if has_decode: @@ -1052,22 +1095,25 @@ def forward( prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim] if self.torchair_graph_enabled: num_tokens = prefill_hs_or_q_c.shape[0] + seq_len = self.rotary_emb.max_position_embeddings + cos = self.rotary_emb.cos_cached[:seq_len].to( + dtype=prefill_q_pe.dtype) + sin = self.rotary_emb.sin_cached[:seq_len].to( + dtype=prefill_q_pe.dtype) + cos = cos[attn_metadata.prefill.input_positions] + sin = sin[attn_metadata.prefill.input_positions] + cos = cos[:, None, None, :] + sin = sin[:, None, None, :] + + prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin) + prefill_k_pe, prefill_k_nope = self.exec_kv_prefill( + hidden_states_or_kv_c_normed, cos, sin, kv_cache, + attn_metadata.slot_mapping) + + kv_c_normed = prefill_k_nope[:num_actual_toks, ...] 
+ prefill_k_c_normed = prefill_k_nope[num_decode_tokens:] prefill_k_pe = prefill_k_pe.view(num_tokens, self.num_kv_heads, -1) - if self.rotary_emb.__class__.__name__ == 'RotaryEmbedding': - # NOTE: When scaling not specified - ori_q_pe_shape, ori_k_pe_shape = prefill_q_pe.shape, prefill_k_pe.shape - prefill_q_pe = prefill_q_pe.reshape(num_tokens, -1) - prefill_k_pe = prefill_k_pe.reshape(num_tokens, -1) - prefill_q_pe, prefill_k_pe = self.rotary_emb( - attn_metadata.prefill.input_positions, prefill_q_pe, - prefill_k_pe) - prefill_q_pe = prefill_q_pe.view(ori_q_pe_shape) - prefill_k_pe = prefill_k_pe.view(ori_k_pe_shape) - else: - prefill_q_pe, prefill_k_pe = self.rotary_emb( - attn_metadata.prefill.input_positions, prefill_q_pe, - prefill_k_pe) prefill_q = torch.cat([prefill_q_nope, prefill_q_pe], dim=-1) else: prefill_q_pe[...], prefill_k_pe[...] = self.rotary_emb( From b686540ee613c8556eaf3f1c84863ad900170908 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Wed, 11 Jun 2025 16:33:11 +0800 Subject: [PATCH 14/28] [CI] Upgrade vllm to 0.9.1 (#1165) 1. upgrade vllm to 0.9.1. 0.9.0 is not supported for main branch now. keep doc to 0.9.0 until we release the first 0.9.1 release. 2. disable V0 test for PR 3. move actionlint check to lint job Signed-off-by: wangxiyuan Signed-off-by: wangxiaoxin (A) --- .github/workflows/accuracy_test.yaml | 5 +- .github/workflows/actionlint.yml | 53 -------- .github/workflows/nightly_benchmarks.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 13 +- .../workflows/vllm_ascend_test_long_term.yaml | 2 +- .github/workflows/vllm_ascend_test_pd.yaml | 2 +- Dockerfile | 2 +- Dockerfile.openEuler | 2 +- tests/singlecard/compile/test_simple.py | 32 ++--- tests/singlecard/test_scheduler.py | 61 +++------ vllm_ascend/compilation/piecewise_backend.py | 8 +- vllm_ascend/core/scheduler.py | 19 +-- vllm_ascend/patch/__init__.py | 15 +-- vllm_ascend/patch/platform/__init__.py | 4 +- .../platform/patch_0_9_0/patch_distributed.py | 116 ------------------ .../patch_0_9_1}/__init__.py | 0 vllm_ascend/patch/worker/__init__.py | 4 +- .../patch_0_9_1}/__init__.py | 1 - vllm_ascend/worker/model_runner_v1.py | 51 +++----- 19 files changed, 72 insertions(+), 320 deletions(-) delete mode 100644 .github/workflows/actionlint.yml delete mode 100644 vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py rename vllm_ascend/patch/{worker/patch_0_9_0 => platform/patch_0_9_1}/__init__.py (100%) rename vllm_ascend/patch/{platform/patch_0_9_0 => worker/patch_0_9_1}/__init__.py (90%) diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index d1a61230e..999fb6ad5 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -34,8 +34,7 @@ on: # Current supported vLLM versions options: - main - - v0.9.0.1 - - v0.9.0 + - v0.9.1 - v0.7.3 vllm-ascend-version: description: 'vllm-ascend version:' @@ -159,7 +158,7 @@ jobs: repository: vllm-project/vllm path: ./vllm-empty # Please also update this when bump matched version - ref: ${{ github.event.inputs.vllm-version || 'v0.9.0' }} + ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }} - name: Install vllm-project/vllm from source working-directory: ./vllm-empty diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml deleted file mode 100644 index 91cd9c412..000000000 --- a/.github/workflows/actionlint.yml +++ /dev/null @@ -1,53 +0,0 @@ -# -# Copyright 2023 The vLLM team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: Lint GitHub Actions workflows -on: - pull_request: - branches: - - 'main' - - '*-dev' - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run actionlint" - env: - SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086 - run: | - echo "::add-matcher::.github/workflows/matchers/actionlint.json" - tools/actionlint.sh -color diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 67f8f9096..4f00e21df 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -46,7 +46,7 @@ jobs: strategy: matrix: include: - - vllm_branch: v0.9.0 + - vllm_branch: v0.9.1 vllm_ascend_branch: main container: image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index aaaa3b753..325b46202 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -31,6 +31,9 @@ on: - '!benchmarks/**' - 'tools/mypy.sh' - 'mypy.ini' + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. 
@@ -85,6 +88,13 @@ jobs: repository: vllm-project/vllm path: vllm-empty + - name: Actionlint Check + env: + SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086 + run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" + tools/actionlint.sh -color + - name: Install vllm-project/vllm from source working-directory: vllm-empty run: | @@ -103,7 +113,7 @@ jobs: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [v0.9.0, v0.9.1] + vllm_version: [main, v0.9.1] concurrency: group: > ${{ @@ -191,6 +201,7 @@ jobs: fi - name: Run vllm-project/vllm-ascend test on V0 engine + if: ${{ github.event_name == 'schedule' }} env: VLLM_USE_V1: 0 run: | diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index 11144280c..4b5253c72 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -40,7 +40,7 @@ jobs: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [v0.9.0, v0.9.1] + vllm_version: [main, v0.9.1] name: vLLM Ascend long term test runs-on: ${{ matrix.os }} container: diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml index 0de616d4f..84800c7e3 100644 --- a/.github/workflows/vllm_ascend_test_pd.yaml +++ b/.github/workflows/vllm_ascend_test_pd.yaml @@ -38,7 +38,7 @@ jobs: if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }} strategy: matrix: - vllm_verison: [v0.9.0, v0.9.1] + vllm_verison: [main, v0.9.1] name: vLLM Ascend prefilling decoding disaggregation test runs-on: linux-arm64-npu-static-8 diff --git a/Dockerfile b/Dockerfile index 1dfd10c86..952e77fe9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.0 +ARG VLLM_TAG=v0.9.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index ffd1174d2..2ff3d0b39 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.0 +ARG VLLM_TAG=v0.9.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
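For readers unfamiliar with how vllm-ascend gates code on the installed vLLM release, the following minimal sketch shows the version-gate pattern that the hunks in this patch prune from 0.9.0 to 0.9.1. The helper shown here is an approximation; the real vllm_version_is lives in vllm_ascend.utils and may be implemented differently.

from importlib.metadata import PackageNotFoundError, version


def vllm_version_is(target: str) -> bool:
    # Approximation of vllm_ascend.utils.vllm_version_is: compare the
    # installed vLLM release string against a target version.
    try:
        return version("vllm") == target
    except PackageNotFoundError:
        return False


# The gating pattern itself is unchanged by this patch; only the supported
# release moves from 0.9.0 to 0.9.1:
if vllm_version_is("0.9.1"):
    print("import the 0.9.1-specific patch modules")
else:
    print("import only the common patch modules")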
diff --git a/tests/singlecard/compile/test_simple.py b/tests/singlecard/compile/test_simple.py index 64d4cba67..70b89297a 100644 --- a/tests/singlecard/compile/test_simple.py +++ b/tests/singlecard/compile/test_simple.py @@ -14,8 +14,6 @@ set_current_vllm_config) from vllm.utils import direct_register_custom_op -from vllm_ascend.utils import vllm_version_is - global_counter = 0 # create a library to hold the custom op @@ -93,28 +91,14 @@ def test_simple_piecewise_compile(): model = SillyModel(vllm_config=vllm_config, prefix="") inputs = torch.randn(100).npu() - - if vllm_version_is("0.9.0"): - kwargs = { - "num_graphs_seen": 1, # one graph for the model - "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 - "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers - "num_backend_compilations": - 3, # num_piecewise_capturable_graphs_seen - "num_cudagraph_caputured": - 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - } - else: - kwargs = { - "num_graphs_seen": 1, # one graph for the model - "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 - "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers - "num_backend_compilations": - 3, # num_piecewise_capturable_graphs_seen - "num_cudagraph_captured": - 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - } - + kwargs = { + "num_graphs_seen": 1, # one graph for the model + "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 + "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers + "num_backend_compilations": 3, # num_piecewise_capturable_graphs_seen + "num_cudagraph_captured": + 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + } with compilation_counter.expect(kwargs): model(inputs) diff --git a/tests/singlecard/test_scheduler.py b/tests/singlecard/test_scheduler.py index d1c606278..8021f0306 100644 --- a/tests/singlecard/test_scheduler.py +++ b/tests/singlecard/test_scheduler.py @@ -31,7 +31,6 @@ from vllm.v1.structured_output import StructuredOutputManager from vllm_ascend.core.scheduler import AscendScheduler -from vllm_ascend.utils import vllm_version_is EOS_TOKEN_ID = 50256 @@ -87,27 +86,15 @@ def create_scheduler( vllm_config = VllmConfig(scheduler_config=scheduler_config, model_config=model_config, cache_config=cache_config) - - if vllm_version_is("0.9.0"): - kv_cache_config = KVCacheConfig( - num_blocks=10000, # A large number of blocks to hold all requests - tensors={}, - kv_cache_groups=[ - KVCacheGroupSpec(['layer'], - FullAttentionSpec(16, 1, 1, torch.float32, - False)) - ], - ) - else: - kv_cache_config = KVCacheConfig( - num_blocks=10000, # A large number of blocks to hold all requests - kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])], - kv_cache_groups=[ - KVCacheGroupSpec(['layer'], - FullAttentionSpec(16, 1, 1, torch.float32, - False, None)) - ], - ) + kv_cache_config = KVCacheConfig( + num_blocks=10000, # A large number of blocks to hold all requests + kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])], + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(16, 1, 1, torch.float32, False, + None)) + ], + ) cache_config.num_gpu_blocks = 10000 return AscendScheduler( vllm_config, @@ -135,27 +122,15 @@ def create_requests(num_requests: int, else: mm_position = None mm_inputs = None - if vllm_version_is("0.9.0"): - request = Request( - request_id=f"{i}", - prompt_token_ids=[i] * num_tokens, - sampling_params=sampling_params, - multi_modal_inputs=mm_inputs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=None, - 
arrival_time=0, - eos_token_id=EOS_TOKEN_ID, - ) - else: - request = Request( - request_id=f"{i}", - prompt_token_ids=[i] * num_tokens, - sampling_params=sampling_params, - multi_modal_inputs=mm_inputs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=None, - eos_token_id=EOS_TOKEN_ID, - ) + request = Request( + request_id=f"{i}", + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_inputs=mm_inputs, + multi_modal_placeholders=mm_position, + multi_modal_hashes=None, + eos_token_id=EOS_TOKEN_ID, + ) requests.append(request) return requests diff --git a/vllm_ascend/compilation/piecewise_backend.py b/vllm_ascend/compilation/piecewise_backend.py index 95ce69393..c6a800b3d 100644 --- a/vllm_ascend/compilation/piecewise_backend.py +++ b/vllm_ascend/compilation/piecewise_backend.py @@ -31,8 +31,6 @@ from vllm.logger import logger from vllm.utils import weak_ref_tensors -from vllm_ascend.utils import vllm_version_is - @dataclasses.dataclass class ConcreteSizeEntry: @@ -206,11 +204,7 @@ def __call__(self, *args) -> Any: # to save memory entry.output = weak_ref_tensors(output) entry.aclgraph = aclgraph - - if vllm_version_is("0.9.0"): - compilation_counter.num_cudagraph_caputured += 1 - else: - compilation_counter.num_cudagraph_captured += 1 + compilation_counter.num_cudagraph_captured += 1 # important: we need to return the output, rather than # the weak ref of the output, so that pytorch can correctly diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index 42f5d9c69..05c663feb 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -29,8 +29,6 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm_ascend.utils import vllm_version_is - class AscendScheduler(Scheduler): """This Scheduler extends vllm's original v1 scheduler @@ -129,12 +127,7 @@ def skip_cur_request(): continue assert num_new_tokens > 0 - - if vllm_version_is("0.9.0"): - blocks = computed_blocks.blocks - else: - blocks = computed_blocks.blocks[0] - + blocks = computed_blocks.blocks[0] watermark = getattr(self.scheduler_config, "watermark", 0.01) if not self._check_watermark_for_prefill(request, num_new_tokens, blocks, watermark): @@ -330,14 +323,8 @@ def _check_watermark_for_prefill(self, len(computed_blocks) * self.block_size) num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, self.block_size) - - if vllm_version_is("0.9.0"): - req_blocks = self.kv_cache_manager.single_type_manager.req_to_blocks[ - request.request_id] - else: - req_blocks = self.kv_cache_manager.coordinator.get_blocks( - request.request_id) - + req_blocks = self.kv_cache_manager.coordinator.get_blocks( + request.request_id) num_new_blocks = (num_required_blocks - len(req_blocks) - len(computed_blocks)) num_evictable_computed_blocks = sum(1 for blk in computed_blocks diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index 4be92c23a..3c24bfc70 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -24,9 +24,9 @@ # each worker's `__init__` function. # # Then in each kind of patch, there are three folders: -# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0. +# - patch_0_9_1: contains the patches applied when vllm version is 0.9.1. # - patch_main: contains the patches applied when vllm version is main branch. -# - patch_common: contains the patches applied in both 0.9.0 and main branch. 
+# - patch_common: contains the patches applied in both 0.9.1 and main branch. # # Once a new patch is added in vllm-ascend, please add the patch description into this file as well. # ---------------------------------------------------------------------------------- @@ -35,17 +35,6 @@ # -------------------------------- # * Platform Patch: # ================= -# ** File: platform/patch_0_9_0/patch_distributed.py** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. `vllm.distributed.utils.stateless_init_torch_distributed_process_group()` -# Why: -# vllm distributed use gloo backend by default to initialize stateless process group, but we want to use hccl here -# How: -# Add hccl backend to the `stateless_init_torch_distributed_process_group` -# Related PR (if no, explain why): -# https://github.com/vllm-project/vllm/pull/18763 -# Future Plan: -# Remove this patch once vllm is upgraded to 0.9.1 # ** File: platform/patch_common/patch_distributed.py** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.distributed.parallel_state.destroy_model_parallel()` diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index e724fe531..4ec38e38d 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -17,8 +17,8 @@ from vllm_ascend.utils import vllm_version_is # Import specific patches for different versions -if vllm_version_is("0.9.0"): - from vllm_ascend.patch.platform import patch_0_9_0 # noqa: F401 +if vllm_version_is("0.9.1"): + from vllm_ascend.patch.platform import patch_0_9_1 # noqa: F401 from vllm_ascend.patch.platform import patch_common # noqa: F401 else: from vllm_ascend.patch.platform import patch_common # noqa: F401 diff --git a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py b/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py deleted file mode 100644 index d468326bd..000000000 --- a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py +++ /dev/null @@ -1,116 +0,0 @@ -import torch -from torch.distributed import ProcessGroup -from torch.distributed.distributed_c10d import (Backend, PrefixStore, - _get_default_timeout, - is_nccl_available) -from torch.distributed.rendezvous import rendezvous -from vllm.distributed import utils - - -def stateless_init_torch_distributed_process_group( - host: str, port: int, rank: int, world_size: int, - backend: str) -> ProcessGroup: - """ - A replacement for `torch.distributed.init_process_group` that does not - pollute the global state. The created ProcessGroup object can be used for - some operations such as `allreduce`, because it does not depend on the - global rank. However, some operations such as `broadcast` cannot be used - because it depends on the global rank. - - # TODO: ask for help from PyTorch team if we need the `broadcast` operation. - - This function is useful when we are not sure about the total number of - processes in the process group. For example, we may have process - 1, 2, ..., 8 who want to communicate, and process 9 might be the same - process as process 1, or it might be a different process; process 10 - might be the same process as process 5, or it might be a different process. - In this case, how can we reliably form a communication channel within - process 9 and 10, without affecting the communication channel within - process 1, 2, ..., 8? 
- - One possible solution is to figure out if process 9 and 10 are the same - as process 1 and 5 beforehand, and then form a communication channel - based on the information, adjusting the ranks and world_size etc. However, - figuring out the information is not always easy, and it will interfere - with the main communication channel. - - Our solution is to always form a communication channel with process 1, 2, - ..., 8, and then use this function to form another communication channel - with process 9 and 10. This way, regardless of whether process 9 and 10 - are the same as process 1 and 5, the main communication channel is - always formed with process 1, 2, ..., 8, and the additional communication - channel is formed with process 9 and 10. - """ - init_method = f"tcp://{host}:{port}" - backend = Backend(backend) # it is basically string - timeout = _get_default_timeout(backend) - - store, rank, world_size = next( - rendezvous(init_method, rank, world_size, timeout=timeout)) - store.set_timeout(timeout) - - group_rank = rank - group_size = world_size - - # Use a PrefixStore to avoid accidental overrides of keys used by - # different systems (e.g. RPC) in case the store is multi-tenant. - prefix_store = PrefixStore(init_method, store) - - # TODO(Yizhou): The reason we need to set options while vllm does not - # seems to be related to the version of PyTorch. In the latest version, - # there is no need to set options. While in the older version, 2.5.1 - # specifically, we need to set options. - options = ProcessGroup.Options(backend=backend) - pg: ProcessGroup = ProcessGroup( - prefix_store, - group_rank, - group_size, - options, - ) - if backend == "gloo": - from torch.distributed.distributed_c10d import ProcessGroupGloo - backend_class = ProcessGroupGloo(prefix_store, - group_rank, - group_size, - timeout=timeout) - backend_type = ProcessGroup.BackendType.GLOO - device = torch.device("cpu") - elif backend == "nccl": - assert is_nccl_available() - from torch.distributed.distributed_c10d import ProcessGroupNCCL - - backend_options = ProcessGroupNCCL.Options() - backend_options._timeout = timeout - - backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, - backend_options) - backend_type = ProcessGroup.BackendType.NCCL - device = torch.device("cuda") - elif backend == "hccl": - from torch.distributed import is_hccl_available - assert is_hccl_available() - from torch_npu._C._distributed_c10d import ProcessGroupHCCL - backend_options = ProcessGroupHCCL.Options() - backend_options._timeout = timeout - backend_class = ProcessGroupHCCL(prefix_store, group_rank, group_size, - backend_options) - device = torch.device("npu") - backend_class._set_sequence_number_for_group() - backend_type = ProcessGroup.BackendType.CUSTOM - pg._register_backend(device, backend_type, backend_class) - return pg - else: - raise RuntimeError(f"Unsupported torch distributed backend: {backend}") - - # TODO(Yizhou): Like we mentioned above, _set_default_backend is not - # implemented in the 2.5.1 version of PyTorch. But we need to set it - # after the latest version is released. 
- # pg._set_default_backend(backend_type) - backend_class._set_sequence_number_for_group() - - pg._register_backend(device, backend_type, backend_class) - - return pg - - -utils.stateless_init_torch_distributed_process_group = stateless_init_torch_distributed_process_group diff --git a/vllm_ascend/patch/worker/patch_0_9_0/__init__.py b/vllm_ascend/patch/platform/patch_0_9_1/__init__.py similarity index 100% rename from vllm_ascend/patch/worker/patch_0_9_0/__init__.py rename to vllm_ascend/patch/platform/patch_0_9_1/__init__.py diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index d1d3d42f4..3b29856d2 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -18,8 +18,8 @@ from vllm_ascend.utils import vllm_version_is # Import specific patches for different versions -if vllm_version_is("0.9.0"): - from vllm_ascend.patch.worker import patch_0_9_0 # noqa: F401 +if vllm_version_is("0.9.1"): + from vllm_ascend.patch.worker import patch_0_9_1 # noqa: F401 from vllm_ascend.patch.worker import patch_common # noqa: F401 else: from vllm_ascend.patch.worker import patch_common # noqa: F401 diff --git a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py similarity index 90% rename from vllm_ascend/patch/platform/patch_0_9_0/__init__.py rename to vllm_ascend/patch/worker/patch_0_9_1/__init__.py index f0ac16236..116c73c06 100644 --- a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py +++ b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py @@ -14,4 +14,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import vllm_ascend.patch.platform.patch_0_9_0.patch_distributed # noqa diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 68f476f4c..8dcad248c 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -68,7 +68,7 @@ from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata from vllm_ascend.platform import NPUPlatform from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler -from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is +from vllm_ascend.utils import ProfileExecuteDuration from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer if TYPE_CHECKING: @@ -1360,44 +1360,27 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: import torch_npu kv_caches: Dict[str, torch.Tensor] = {} - # Remove this after we drop 0.9.0 support - if vllm_version_is("0.9.0"): - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.model_config.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=True, - vocab_size=self.model_config.get_vocab_size(), - block_size=self.cache_config.block_size, - ) - else: - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.model_config.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=True, - vocab_size=self.model_config.get_vocab_size(), - block_sizes=[self.cache_config.block_size], - ) + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.model_config.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=True, + vocab_size=self.model_config.get_vocab_size(), + block_sizes=[self.cache_config.block_size], + ) - if not 
vllm_version_is("0.9.0"): - kv_cache_sizes = {} - for kv_cache_tensor in kv_cache_config.kv_cache_tensors: - assert len(kv_cache_tensor.shared_by) == 1, ( - "KV cache tensor shared by multiple layers is not supported in " - "NPU.") - kv_cache_sizes[ - kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size + kv_cache_sizes = {} + for kv_cache_tensor in kv_cache_config.kv_cache_tensors: + assert len(kv_cache_tensor.shared_by) == 1, ( + "KV cache tensor shared by multiple layers is not supported in " + "NPU.") + kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size for kv_cache_group in kv_cache_config.kv_cache_groups: kv_cache_spec = kv_cache_group.kv_cache_spec for layer_name in kv_cache_group.layer_names: - if vllm_version_is("0.9.0"): - tensor_size = kv_cache_config.tensors[layer_name].size - else: - tensor_size = kv_cache_sizes[layer_name] + tensor_size = kv_cache_sizes[layer_name] assert tensor_size % kv_cache_spec.page_size_bytes == 0 num_blocks = tensor_size // kv_cache_spec.page_size_bytes From a265a4fd63232408b9c42415a99499de71fad046 Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Wed, 11 Jun 2025 20:55:44 +0800 Subject: [PATCH 15/28] [Scheduler][MTP] Add support for speculative decoding in AsecendScheduler. (#943) This PR adds support for speculative decoding in AsecendScheduler. Also inculde part of support for disaggregated prefill, full support will be merged in follow-up PR. --------- Signed-off-by: whx-sjtu <2952154980@qq.com> Signed-off-by: wangxiaoxin (A) --- .github/workflows/vllm_ascend_test.yaml | 15 +- tests/singlecard/core/__init__.py | 0 .../singlecard/core/test_ascend_scheduler.py | 792 ++++++++++++++++++ .../core/test_ascend_scheduler_e2e.py | 40 + vllm_ascend/core/scheduler.py | 205 ++++- 5 files changed, 1002 insertions(+), 50 deletions(-) create mode 100644 tests/singlecard/core/__init__.py create mode 100644 tests/singlecard/core/test_ascend_scheduler.py create mode 100644 tests/singlecard/core/test_ascend_scheduler_e2e.py diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 325b46202..54a3249e4 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -178,18 +178,20 @@ jobs: run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py - pytest -sv tests/singlecard/test_scheduler.py # guided decoding doesn't work, fix it later # pytest -sv tests/singlecard/test_guided_decoding.py.py # test_ascend_config.py should be ran separately because it will regenerate the global config many times. pytest -sv tests/singlecard/test_ascend_config.py pytest -sv tests/singlecard/test_camem.py + # pytest -sv tests/singlecard/core/test_ascend_scheduler.py + # pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_scheduler.py \ --ignore=tests/singlecard/test_guided_decoding.py \ --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_camem.py + --ignore=tests/singlecard/test_camem.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py else pytest -sv tests/multicard/test_ilama_lora_tp2.py # To avoid oom, we need to run the test in a single process. 
@@ -207,20 +209,21 @@ jobs: run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py - pytest -sv tests/singlecard/test_scheduler.py # guided decoding doesn't work, fix it later # pytest -sv tests/singlecard/test_guided_decoding.py.py pytest -sv tests/singlecard/test_camem.py # test_ascend_config.py should be ran separately because it will regenerate the global config many times. pytest -sv tests/singlecard/test_ascend_config.py pytest -sv tests/singlecard/test_prompt_embedding.py + pytest -sv tests/singlecard/core/test_ascend_scheduler.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_scheduler.py \ --ignore=tests/singlecard/test_guided_decoding.py \ --ignore=tests/singlecard/test_camem.py \ --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_prompt_embedding.py + --ignore=tests/singlecard/test_prompt_embedding.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py else pytest -sv tests/multicard/test_ilama_lora_tp2.py # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. diff --git a/tests/singlecard/core/__init__.py b/tests/singlecard/core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/singlecard/core/test_ascend_scheduler.py b/tests/singlecard/core/test_ascend_scheduler.py new file mode 100644 index 000000000..c382ebdf4 --- /dev/null +++ b/tests/singlecard/core/test_ascend_scheduler.py @@ -0,0 +1,792 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import pytest +import torch +from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, + SchedulerConfig, SpeculativeConfig, VllmConfig) +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.sampling_params import SamplingParams +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec) +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.request import Request, RequestStatus +from vllm.v1.structured_output import StructuredOutputManager + +from vllm_ascend.core.scheduler import AscendScheduler +from vllm_ascend.utils import vllm_version_is + +EOS_TOKEN_ID = 50256 + + +def create_scheduler( + model: str = "Qwen/Qwen2.5-0.5B-Instruct", + max_num_seqs: int = 16, + max_num_batched_tokens: int = 8192, + enable_prefix_caching: Optional[bool] = None, + long_prefill_token_threshold: int = 0, + disable_chunked_mm_input: bool = False, + use_kv_connector: bool = False, + num_blocks: int = 10000, + block_size: int = 16, + max_model_len: Optional[int] = None, + num_speculative_tokens: Optional[int] = None, + enable_chunked_prefill: bool = False, +) -> AscendScheduler: + '''Create scheduler under test. 
+ + Args: + model: model under test + max_num_seqs: max sequences to schedule + max_num_batch_tokens: max num tokens to batch + enable_prefix_caching: optionally force APC config + (True/False) or use default + (None) + + Returns: + {class}`Scheduler` instance + ''' + if max_model_len is None: + max_model_len = max_num_batched_tokens + scheduler_config = SchedulerConfig( + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + max_model_len=max_model_len, + long_prefill_token_threshold=long_prefill_token_threshold, + disable_chunked_mm_input=disable_chunked_mm_input, + enable_chunked_prefill=enable_chunked_prefill, + ) + model_config = ModelConfig( + model=model, + task="auto", + tokenizer=model, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="float16", + seed=42, + ) + # Cache config, optionally force APC + kwargs_cache = ({} if enable_prefix_caching is None else { + 'enable_prefix_caching': enable_prefix_caching + }) + cache_config = CacheConfig( + block_size=block_size, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + **kwargs_cache, + ) + kv_transfer_config = KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": "local_storage"}, + ) if use_kv_connector else None + + speculative_config: Optional[SpeculativeConfig] = None + if num_speculative_tokens is not None: + speculative_config = SpeculativeConfig( + model="ngram", num_speculative_tokens=num_speculative_tokens) + + vllm_config = VllmConfig( + scheduler_config=scheduler_config, + model_config=model_config, + cache_config=cache_config, + kv_transfer_config=kv_transfer_config, + speculative_config=speculative_config, + ) + kv_cache_config = KVCacheConfig( + num_blocks=num_blocks, # A large number of blocks to hold all requests + **({ + "tensors": {} + } if vllm_version_is("0.9.0") else { + "kv_cache_tensors": [] + }), + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(block_size, 1, 1, torch.float32, + False)) + ], + ) + cache_config.num_gpu_blocks = num_blocks + return AscendScheduler( + vllm_config=vllm_config, + kv_cache_config=kv_cache_config, + log_stats=True, + structured_output_manager=StructuredOutputManager(vllm_config), + ) + + +def create_requests(num_requests: int, + num_tokens: int = 10, + mm_positions: Optional[list[PlaceholderRange]] = None, + max_tokens: int = 16, + stop_token_ids: Optional[list[int]] = None, + prompt_logprobs: Optional[int] = None): + sampling_params = SamplingParams(ignore_eos=False, + max_tokens=max_tokens, + stop_token_ids=stop_token_ids, + prompt_logprobs=prompt_logprobs) + requests = [] + for i in range(num_requests): + if mm_positions is not None: + mm_position = mm_positions[i] + mm_inputs = [MultiModalKwargs({})] * len(mm_position) + else: + mm_position = None + mm_inputs = None + request = Request( + request_id=f"{i}", + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_inputs=mm_inputs, + multi_modal_placeholders=mm_position, + multi_modal_hashes=None, + eos_token_id=EOS_TOKEN_ID, + **({ + "arrival_time": 0.0 + } if vllm_version_is("0.9.0") else {}), + ) + requests.append(request) + return requests + + +def test_add_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + + for i, request in enumerate(requests): + scheduler.add_request(request) + assert request.request_id in scheduler.requests + assert len(scheduler.waiting) == i + 1 + + +def test_finish_request(): + 
scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_ABORTED) + assert request.request_id not in scheduler.requests + assert len(scheduler.waiting) == 9 - i + + +def test_get_num_unfinished_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_STOPPED) + assert scheduler.get_num_unfinished_requests() == len(requests) - i - 1 + + +@pytest.mark.parametrize("enable_prefix_caching, prompt_logprobs", [ + (None, None), + (True, 5), +]) +def test_schedule(enable_prefix_caching: Optional[bool], + prompt_logprobs: Optional[int]): + '''Test scheduling. + Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs + ''' + scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching) + requests = create_requests(num_requests=10, + prompt_logprobs=prompt_logprobs) + for request in requests: + scheduler.add_request(request) + + # Test initial scheduling + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + # Verify all requests are scheduled. + for req_id, num_tokens in output.num_scheduled_tokens.items(): + assert num_tokens == len(requests[int(req_id)].prompt_token_ids) + + # Verify requests moved from waiting to running + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == len(requests) + for i, request in enumerate(requests): + assert scheduler.running[i] == request + + +@pytest.mark.parametrize("enable_prefix_caching", [True, False]) +def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): + """Test scheduling behavior with concurrent partial requests. + + This test verifies that: there are multiple long prefill requests in the + RUNNING state, and we can schedule them together. + + """ + scheduler = create_scheduler( + model="facebook/opt-125m", + max_num_batched_tokens=1024, + long_prefill_token_threshold=400, + enable_prefix_caching=enable_prefix_caching, + enable_chunked_prefill=True, + ) + requests = create_requests( + num_requests=3, + num_tokens=800, + ) + for request in requests: + scheduler.add_request(request) + + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 3 + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + + # The first request is scheduled partially - 400. + assert output.num_scheduled_tokens[requests[0].request_id] == 400 + # The second request is scheduled partially - 400. + assert output.num_scheduled_tokens[requests[1].request_id] == 400 + # The third request is also scheduled partially - 1024 - 400 - 400 = 224. + assert output.num_scheduled_tokens[requests[2].request_id] == 224 + req_to_index = { + request.request_id: i + for i, request in enumerate(requests) + } + model_runner_output = ModelRunnerOutput( + req_ids=[request.request_id for request in requests], + req_id_to_index=req_to_index, + sampled_token_ids=[[] for _ in range(len(requests))], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(output, model_runner_output) + + # Schedule the next step. 
All three requests are running. + # Processed the remaining prefills of the first and second requests. + output1 = scheduler.schedule() + assert len(scheduler.running) == 3 + assert len(output1.scheduled_new_reqs) == 0 + assert len(output1.scheduled_cached_reqs) == 3 + assert len(output1.finished_req_ids) == 0 + assert output1.num_scheduled_tokens[requests[0].request_id] == 400 + assert output1.num_scheduled_tokens[requests[1].request_id] == 400 + assert output1.num_scheduled_tokens[requests[2].request_id] == 224 + + # Schedule the third step. All three requests are running. + # First and second requests are in the decode stage. + # All the remaining tokens in the third request are processed. + model_runner_output = ModelRunnerOutput( + req_ids=[request.request_id for request in requests], + req_id_to_index=req_to_index, + sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(output1, model_runner_output) + output2 = scheduler.schedule() + assert len(scheduler.running) == 3 + assert len(output2.scheduled_new_reqs) == 0 + assert len(output2.scheduled_cached_reqs) == 3 + assert len(output2.finished_req_ids) == 0 + assert output2.num_scheduled_tokens[requests[0].request_id] == 1 + assert output2.num_scheduled_tokens[requests[1].request_id] == 1 + assert output2.num_scheduled_tokens[ + requests[2].request_id] == 800 - 224 - 224 + + +def test_stop_via_update_from_output(): + """Test stopping behavior through update_from_output""" + scheduler = create_scheduler(num_speculative_tokens=1) + + # Test case 1: Stop on EOS token + requests = create_requests(num_requests=2, max_tokens=10) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 1, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [], + requests[1].request_id: [10] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[EOS_TOKEN_ID], + [10, + 11]], # First request hits EOS, second continues + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped, second continues + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_STOPPED + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID] + assert list(requests[1].output_token_ids) == [10, 11] + + # Test case 2: Stop on custom stop token + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=2, + max_tokens=10, + stop_token_ids=[42, 43]) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + 
scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=5, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 42], + requests[1].request_id: [13] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped on custom token + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_STOPPED + assert requests[0].stop_reason == 42 + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [10, 42] + assert list(requests[1].output_token_ids) == [13, 14] + + # Test case 3: Stop on max tokens + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=2, max_tokens=2) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 1 + }, + total_num_scheduled_tokens=4, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 11], + requests[1].request_id: [] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped due to length + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_LENGTH_CAPPED + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [10, 11 + ] # Truncated to max_tokens + assert list(requests[1].output_token_ids) == [13] + + # Test case 4: Ignore EOS flag + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=1, max_tokens=10) + requests[0].sampling_params.ignore_eos = True + requests[0].num_computed_tokens = requests[0].num_tokens + scheduler.requests[requests[0].request_id] = requests[0] + scheduler.running.append(requests[0]) + + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={requests[0].request_id: 3}, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [EOS_TOKEN_ID, 10] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + 
structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify request continues past EOS + assert len(scheduler.running) == 1 + assert not requests[0].is_finished() + assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID, 10, 11] + + +@pytest.mark.parametrize("enable_prefix_caching, prompt_logprobs", [ + (None, None), + (True, 5), +]) +def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool], + prompt_logprobs: Optional[int]): + scheduler = create_scheduler( + max_num_batched_tokens=1024, + max_num_seqs=2, + enable_prefix_caching=enable_prefix_caching, + enable_chunked_prefill=True, + ) + requests = create_requests( + num_requests=2, + num_tokens=512, + prompt_logprobs=prompt_logprobs, + ) + + # Schedule the first request. + scheduler.add_request(requests[0]) + scheduler_output0 = scheduler.schedule() + assert len(scheduler_output0.scheduled_new_reqs) == 1 + assert scheduler_output0.num_scheduled_tokens[ + requests[0].request_id] == 512 + + # The first request is still running, so only schedule the second request. + scheduler.add_request(requests[1]) + scheduler_output1 = scheduler.schedule() + assert len(scheduler_output1.scheduled_new_reqs) == 1 + assert scheduler_output1.num_scheduled_tokens[ + requests[1].request_id] == 512 + + # Model output of the first request. + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(scheduler_output0, model_runner_output) + + # Schedule the next step. + # The first request can be scheduled again while the second + # request is still running. + scheduler_output2 = scheduler.schedule() + assert scheduler_output2.num_scheduled_tokens[requests[0].request_id] == 1 + + # Model output of the second request. + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(scheduler_output1, model_runner_output) + + +# Note - these test cases mirror some of those in test_rejection_sampler.py +@pytest.mark.parametrize( + "spec_tokens,output_tokens,expected", + [ + ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match + ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch + ([[1, 2], [3]], [[1, 2, 5], [3, 4]], + (2, 3, 3, [2, 1])), # multiple sequences + ([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence + ([[]], [[5]], (0, 0, 0, [0])), # empty sequence + ([[1, 2, 3], [4, 5, 6]], [[1, 2, 7], [4, 8]], + (2, 6, 3, [2, 1, 0])), # multiple mismatches + ]) +def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): + """Test scheduling behavior with speculative decoding. + + This test verifies that: + 1. Speculated tokens get scheduled correctly + 2. 
Spec decoding stats properly count number of draft and accepted tokens + """ + if vllm_version_is("0.9.0"): + return + num_spec_tokens = max(1, max(len(t) for t in spec_tokens)) + scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens) + requests = create_requests(num_requests=len(spec_tokens), num_tokens=1) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i + + # Schedule a decode, which will also draft speculative tokens + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert output.total_num_scheduled_tokens == len(requests) + for i in range(len(requests)): + req_id = requests[i].request_id + assert output.num_scheduled_tokens[req_id] == 1 + assert req_id not in output.scheduled_spec_decode_tokens + + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + spec_token_ids=spec_tokens, + logprobs=None, + prompt_logprobs_dict={}, + ) + engine_core_outputs = scheduler.update_from_output(output, + model_runner_output) + + for i in range(len(requests)): + running_req = scheduler.running[i] + # The prompt token + assert running_req.num_computed_tokens == 1 + # The prompt token and the sampled token + assert running_req.num_tokens == 2 + # The prompt token, the sampled token, and the speculated tokens + assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i]) + + # No draft or accepted tokens counted yet + assert not engine_core_outputs or ( + engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None) + + # Schedule the speculated tokens for validation + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 0 + # The sampled token and speculated tokens + assert output.total_num_scheduled_tokens == \ + len(requests) + sum(len(ids) for ids in spec_tokens) + for i in range(len(requests)): + req_id = requests[i].request_id + assert output.num_scheduled_tokens[req_id] == 1 + len(spec_tokens[i]) + if spec_tokens[i]: + assert len(output.scheduled_spec_decode_tokens[req_id]) == \ + len(spec_tokens[i]) + else: + assert req_id not in output.scheduled_spec_decode_tokens + + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + engine_core_outputs = scheduler.update_from_output(output, + model_runner_output) + + scheduler_stats = engine_core_outputs[0].scheduler_stats \ + if engine_core_outputs else None + if expected[0] == 0: + assert scheduler_stats.spec_decoding_stats is None # type: ignore + else: + assert scheduler_stats.spec_decoding_stats is not None # type: ignore + stats = scheduler_stats.spec_decoding_stats # type: ignore + assert stats.num_drafts == expected[0] + assert stats.num_draft_tokens == expected[1] + assert stats.num_accepted_tokens == expected[2] + assert stats.num_accepted_tokens_per_pos == expected[3] + + +def _assert_right_scheduler_output( + output: SchedulerOutput, + num_requests: int, + expected_num_scheduled_tokens: int, +): + """Check if SchedulerOutput is correct after remote KV cache hit.""" + + # We should inject the kv_connector_metadata. + assert len(output.kv_connector_metadata.requests) == num_requests + + # Only num_tokens - matched_num_new_tokens should be scheduled. 
+ for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + assert num_scheduled_tokens == expected_num_scheduled_tokens + + +def _assert_right_kv_cache_manager( + scheduler: AscendScheduler, + req_ids: list[str], + num_tokens: int, + block_size: int, + num_requests: int, + num_total_blocks: int, +): + """Check whether KVCacheManager is correct after allocate.""" + + # Make sure the request stats are right. + EXPECTED_TOTAL_BLOCKS = num_tokens // block_size + for req_id in req_ids: + blocks = (scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks[req_id]) + hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id] + assert (scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + num_cached_block[req_id] == EXPECTED_TOTAL_BLOCKS) + assert len(blocks) == EXPECTED_TOTAL_BLOCKS + assert len(hashes) == EXPECTED_TOTAL_BLOCKS + + # Make sure we actually touched all the blocks. + BLOCKS_PER_REQ = num_tokens / block_size + assert (scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == + num_total_blocks - num_requests * BLOCKS_PER_REQ) + + +def _step_until_done( + scheduler: AscendScheduler, + output: SchedulerOutput, + model_runner_output: ModelRunnerOutput, +): + """Loop over schedule(), update_from_output() until finished.""" + + all_finished = False + _ = scheduler.update_from_output(output, model_runner_output) + while not all_finished: + # Schedule + a few iterations until stopping. + output = scheduler.schedule() + assert len(scheduler.running) + for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + # We should be in the decode phase now. + assert num_scheduled_tokens == 1 + assert len(output.kv_connector_metadata.requests) == 0 + ecos = scheduler.update_from_output(output, model_runner_output)[0] + all_done = True + for eco in ecos.outputs: + if eco.finish_reason is None: + all_done = False + all_finished = all_done + + +def make_output(scheduler: AscendScheduler): + return ModelRunnerOutput( + req_ids=[req.request_id for req in scheduler.running], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(scheduler.running) + }, + sampled_token_ids=[[1000]] * len(scheduler.running), + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + + +def assert_scheduler_empty(scheduler: AscendScheduler): + """Confirm the scheduler is "empty" - i.e. no leaks.""" + # Scheduler Metadata. + assert len(scheduler.requests) == 0 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 0 + assert len(scheduler.finished_req_ids) == 0 + assert len(scheduler._cached_reqs_data) == 0 + + # EncoderCacheManager. + assert len(scheduler.encoder_cache_manager.freed) == 0 + assert len(scheduler.encoder_cache_manager.cached) == 0 + + # KVCache Manager. + if not vllm_version_is("0.9.0"): + assert len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks) == 0 + assert len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].num_cached_block) == 0 + assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 + num_free_blocks = ( + scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) + assert num_free_blocks == ( + scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1) + + # NOTE(rob): just the ref count on blocks will be 0. The hash + # value, etc will remain since we lazily evict for prefix cache. 
+ for block in scheduler.kv_cache_manager.block_pool.blocks: + assert block.ref_cnt == 0 + + +def test_memory_leak(): + """Test that we do not have a memory leak.""" + + scheduler = create_scheduler(enable_prefix_caching=True) + + NUM_REQUESTS = 5 + NUM_TOKENS = 10 + MAX_TOKENS = 10 + requests = create_requests(num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + max_tokens=MAX_TOKENS) + + # Add each request. + for request in requests: + scheduler.add_request(request) + scheduler_output = scheduler.schedule() + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Iterate until done. + while True: + scheduler_output = scheduler.schedule() + if len(scheduler.running) == 0: + break + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Confirm no memory leak. + assert_scheduler_empty(scheduler) \ No newline at end of file diff --git a/tests/singlecard/core/test_ascend_scheduler_e2e.py b/tests/singlecard/core/test_ascend_scheduler_e2e.py new file mode 100644 index 000000000..668dafced --- /dev/null +++ b/tests/singlecard/core/test_ascend_scheduler_e2e.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + +import pytest +from vllm import LLM + +if os.getenv("VLLM_USE_V1", "0") != "1": + pytest.skip("Test package requires V1", allow_module_level=True) + +MODEL = "Qwen/Qwen2.5-0.5B-Instruct" +PROMPT = "Hello my name is Robert and I" + + +@pytest.fixture(scope="module") +def model() -> LLM: + return LLM( + MODEL, + enforce_eager=True, + enable_prefix_caching=True, + max_num_batched_tokens=200, + max_num_seqs=3, + additional_config={"ascend_scheduler_config": { + "enabled": True, + }}) + + +def test_concurrent_partial_prefill(model): + outputs = model.generate([PROMPT] * 3) + assert len(outputs) == 3 + for output in outputs: + assert len(output.outputs) == 1 + + +def test_prefix_cache_stats_is_recorded(model): + # 17 tokens will make sure first 16 tokens are cached in a block + input_tokens = {"prompt_token_ids": [101] * 129} + _ = model.generate([input_tokens]) + outputs = model.generate([input_tokens]) + assert outputs[0].num_cached_tokens == 128 \ No newline at end of file diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index 05c663feb..2fa31c264 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -14,16 +14,19 @@ # limitations under the License. # This file is a part of the vllm-ascend project. 
# +import time from collections import deque from typing import Iterable, Union from vllm.config import VllmConfig +from vllm.distributed.kv_events import KVEventBatch from vllm.logger import logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.utils import cdiv +from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler -from vllm.v1.engine import EngineCoreOutputs +from vllm.v1.engine import EngineCoreEventType, EngineCoreOutputs from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus @@ -49,11 +52,6 @@ def __init__( self.scheduled_req_ids: set[str] = set() self.running: list[Request] = [] - if self.vllm_config.kv_transfer_config is not None and \ - self.vllm_config.kv_transfer_config.is_kv_consumer: - raise ValueError( - "AscendScheduler cannot be used for decode nodes. ") - def schedule(self) -> SchedulerOutput: if self.scheduler_config.chunked_prefill_enabled: return super().schedule() @@ -68,6 +66,9 @@ def schedule(self) -> SchedulerOutput: # Spec decode-related. scheduled_spec_decode_tokens: dict[str, list[int]] = {} + # For logging. + scheduled_timestamp = time.monotonic() + # Record scheduled LoRA requests. scheduled_loras: set[int] = set() @@ -86,6 +87,18 @@ def skip_cur_request(): self.waiting.popleft() skipped_waiting_requests.appendleft(request) + num_prealloc_computed_tokens = 0 + # P/D: skip request if still waiting for remote kvs. + if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + is_ready = self._update_waiting_for_remote_kv(request) + if is_ready: + request.status = RequestStatus.WAITING + num_prealloc_computed_tokens = ( + request.num_computed_tokens) + else: + skip_cur_request() + continue + # Check that adding the request still respects the max_loras # constraint. if (self.lora_config and request.lora_request and @@ -95,39 +108,72 @@ def skip_cur_request(): skip_cur_request() continue - prompt_limit = self._get_prompt_limit(request) + num_external_computed_tokens = 0 + load_kv_async = False + # Get already-cached tokens. - computed_blocks, num_computed_tokens = ( - self.kv_cache_manager.get_computed_blocks(request)) - num_new_tokens = request.num_tokens - num_computed_tokens - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) - max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * - self.block_size) - prompt_limit = min(prompt_limit, max_tokens_in_kvcache) - - # Finish request that exceeds prompt_limit or kv cache size. - if num_new_tokens > prompt_limit: - logger.warning( - "Input prompt (%d tokens) is too long" - " and exceeds limit of %d", - num_new_tokens, - prompt_limit, - ) - request.status = RequestStatus.FINISHED_IGNORED - self.finished_req_ids.add(request.request_id) # type: ignore - self.waiting.popleft() - continue + if num_prealloc_computed_tokens == 0: + new_computed_blocks, num_native_computed_tokens = \ + self.kv_cache_manager.get_computed_blocks( + request) + + # Get externally-cached tokens if using a KVConnector. + if self.connector is not None: + num_external_computed_tokens, load_kv_async = ( + self.connector.get_num_new_matched_tokens( + request, num_native_computed_tokens)) + + # Total computed tokens (local + external). 
+ num_computed_tokens = (num_native_computed_tokens + + num_external_computed_tokens) + else: + # P/D: skip checking prefix cache if loaded from remote kvs. + new_computed_blocks = KVCacheBlocks.create_empty() + num_native_computed_tokens = 0 + + # Total computed tokens (allocated in prior step). + num_computed_tokens = num_prealloc_computed_tokens + + # P/D: loading remote KV, do not allocate for new work. + if load_kv_async: + assert num_external_computed_tokens > 0 + num_new_tokens = 0 + blocks = None + # Number of tokens to be scheduled. + else: + prompt_limit = self._get_prompt_limit(request) + # Get already-cached tokens. + computed_blocks, num_computed_tokens = ( + self.kv_cache_manager.get_computed_blocks(request)) + # We use `request.num_tokens` instead of + # `request.num_prompt_tokens` to consider the resumed + # requests, which have output tokens. + num_new_tokens = request.num_tokens - num_computed_tokens + max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * + self.block_size) + prompt_limit = min(prompt_limit, max_tokens_in_kvcache) + + # Finish request that exceeds prompt_limit or kv cache size. + if num_new_tokens > prompt_limit: + logger.warning( + "Input prompt (%d tokens) is too long" + " and exceeds limit of %d", + num_new_tokens, + prompt_limit, + ) + request.status = RequestStatus.FINISHED_IGNORED + self.finished_req_ids.add( # type: ignore + request.request_id) # type: ignore + self.waiting.popleft() + continue - if num_new_tokens > token_budget: - # Scheduling would exceed token_budget, skip. - skip_cur_request() - continue + if num_new_tokens > token_budget: + # Scheduling would exceed token_budget, skip. + skip_cur_request() + continue + assert num_new_tokens > 0 + blocks = computed_blocks.blocks[0] - assert num_new_tokens > 0 - blocks = computed_blocks.blocks[0] watermark = getattr(self.scheduler_config, "watermark", 0.01) if not self._check_watermark_for_prefill(request, num_new_tokens, blocks, watermark): @@ -136,13 +182,38 @@ def skip_cur_request(): continue new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens, new_computed_blocks=computed_blocks) + request, + num_new_tokens + num_external_computed_tokens, + num_native_computed_tokens, + new_computed_blocks=computed_blocks, + num_lookahead_tokens=self.num_lookahead_tokens, + delay_cache_blocks=load_kv_async) if new_blocks is None: # The request cannot be scheduled. break + # KVConnector: update internal state after allocation. + # This information is used to determine if a load is + # needed for this request. + if num_external_computed_tokens: + assert self.connector is not None + self.connector.update_state_after_alloc( + request, + new_computed_blocks + new_blocks, + num_external_computed_tokens, + ) + self.waiting.popleft() + if load_kv_async: + # If loading async, allocate memory and put request + # into the WAITING_FOR_REMOTE_KV state. + skipped_waiting_requests.appendleft(request) + request.status = RequestStatus.WAITING_FOR_REMOTE_KVS + continue self.running.append(request) + if self.log_stats: + request.record_event(EngineCoreEventType.SCHEDULED, + scheduled_timestamp) self.scheduled_req_ids.add(request.request_id) # Check request status. if request.status == RequestStatus.WAITING: @@ -161,6 +232,9 @@ def skip_cur_request(): token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens + # Count the number of prifix cached tokens. 
+ if request.num_cached_tokens < 0: + request.num_cached_tokens = num_computed_tokens # Put back any skipped requests at the head of the waiting queue if skipped_waiting_requests: @@ -179,16 +253,45 @@ def skip_cur_request(): num_new_tokens = (request.num_tokens_with_spec - request.num_computed_tokens) - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) + assert (request.num_tokens - request.num_computed_tokens) == 1 num_new_tokens = min(num_new_tokens, token_budget) - assert num_new_tokens == 1 + # Make sure the input position does not exceed the max model len. + # This is necessary when using spec decoding. + num_new_tokens = min( + num_new_tokens, + self.max_model_len - request.num_computed_tokens) + # Check that adding the request still respects the max_loras + # constraint. + if self.lora_config and request.lora_request and ( + len(scheduled_loras) == self.lora_config.max_loras + and request.lora_request.lora_int_id + not in scheduled_loras): + # Scheduling would exceed max_loras, skip. + num_new_tokens = 0 + + if num_new_tokens == 0: + # The request cannot be scheduled because one of the following + # reason: + # 1. No new tokens to schedule. This may happen when PP>1 and + # we have already scheduled all prompt tokens but they are + # not finished yet. + # 2. Adding the request exceeds the max_loras constraint. + # NOTE(woosuk): Here, by doing `continue` instead of `break`, + # we do not strictly follow the FCFS scheduling policy and + # allow the lower-priority requests to be scheduled. + req_index += 1 + continue + + num_draft_tokens = max( + num_new_tokens + request.num_computed_tokens - + request.num_tokens, 0) while True: new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens) + request, + num_new_tokens, + num_draft_tokens=num_draft_tokens, + num_lookahead_tokens=self.num_lookahead_tokens) if new_blocks is None: # The request cannot be scheduled. # Preempt the lowest-priority request. @@ -196,6 +299,10 @@ def skip_cur_request(): self.kv_cache_manager.free(preempted_req) preempted_req.status = RequestStatus.PREEMPTED preempted_req.num_computed_tokens = 0 + if self.log_stats: + preempted_req.record_event( + EngineCoreEventType.PREEMPTED, + scheduled_timestamp) self.waiting.appendleft(preempted_req) preempted_reqs.append(preempted_req) if preempted_req == request: @@ -230,6 +337,10 @@ def skip_cur_request(): scheduled_spec_decode_tokens[request.request_id] = ( request.spec_token_ids) + # Record scheduled LoRA requests. + if self.lora_config and request.lora_request: + scheduled_loras.add(request.lora_request.lora_int_id) + # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens @@ -297,6 +408,11 @@ def skip_cur_request(): meta = self.connector.build_connector_meta(scheduler_output) scheduler_output.kv_connector_metadata = meta + events = self.kv_cache_manager.take_events() + if events: + batch = KVEventBatch(ts=time.time(), events=events) + self.kv_event_publisher.publish(batch) + # Advance the number of computed tokens for the request AFTER # the request is scheduled. # 1. The scheduler_output of the current step has to include the @@ -388,7 +504,8 @@ def update_from_output( if num_tokens_scheduled == 0: # The request was not scheduled in this step. 
continue - self.scheduled_req_ids.remove(req_id) + if req_id in self.scheduled_req_ids: + self.scheduled_req_ids.remove(req_id) return super().update_from_output(scheduler_output, model_runner_output) From fcd5ad867f08101b1c2f805be030ce4f6a4e5672 Mon Sep 17 00:00:00 2001 From: ttanzhiqiang <38750855+ttanzhiqiang@users.noreply.github.com> Date: Thu, 12 Jun 2025 10:44:33 +0800 Subject: [PATCH 16/28] add custom ascendc kernel vocabparallelembedding (#796) This PR add custom ascendc kernel vocabparallelembedding support in vllm-ascend, related CMakeLists and setuptools is also added in this PR. pytest -s benchmarks/ops/ben_vocabparallelembedding.py pytest -s tests/ops/test_vocabparallelembedding.py --------- Signed-off-by: ttanzhiqiang <389825161@qq.com> Signed-off-by: wangxiaoxin (A) --- CMakeLists.txt | 2 - benchmarks/ops/ben_vocabparallelembedding.py | 144 ++++++++ .../get_masked_input_and_mask_kernel.cpp | 345 ++++++++++++++++++ csrc/ops.h | 14 + csrc/torch_binding.cpp | 116 ++++++ tests/ops/test_vocabparallelembedding.py | 91 +++++ 6 files changed, 710 insertions(+), 2 deletions(-) create mode 100644 benchmarks/ops/ben_vocabparallelembedding.py create mode 100644 csrc/kernels/get_masked_input_and_mask_kernel.cpp create mode 100644 tests/ops/test_vocabparallelembedding.py diff --git a/CMakeLists.txt b/CMakeLists.txt index a2c3ad2e1..8d06c75f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,5 +96,3 @@ target_link_libraries( target_link_options(vllm_ascend_C PRIVATE "-Wl,-rpath,$ORIGIN:$ORIGIN/lib") install(TARGETS vllm_ascend_C vllm_ascend_kernels DESTINATION ${VLLM_ASCEND_INSTALL_PATH}) - - diff --git a/benchmarks/ops/ben_vocabparallelembedding.py b/benchmarks/ops/ben_vocabparallelembedding.py new file mode 100644 index 000000000..e91cfed7b --- /dev/null +++ b/benchmarks/ops/ben_vocabparallelembedding.py @@ -0,0 +1,144 @@ +from typing import Tuple + +import numpy as np +import pytest +import torch +import torch_npu # noqa: F401 +import vllm # noqa: F401 + +import vllm_ascend.platform # noqa: F401 + + +def benchmark_npu(fn, num_iterations=100, num_warmup_iterations=50): + """ + Benchmark function for NPU operations + + Args: + fn: Function to benchmark + num_iterations: Number of timing iterations + num_warmup_iterations: Number of warmup iterations + + Returns: + float: Minimum elapsed time in seconds + """ + start = torch.npu.Event(enable_timing=True) + end = torch.npu.Event(enable_timing=True) + times = np.zeros(num_iterations + num_warmup_iterations) + + # Run iterations + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + fn() # Execute the function + end.record() + torch.npu.synchronize() + times[i] = start.elapsed_time(end) + + # Remove warmup iterations and convert to seconds + times = times[num_warmup_iterations:] + elapsed_time = np.amin(times) / 1000 + return elapsed_time + + +def get_masked_input_and_mask_ref( + input_: torch.Tensor, org_vocab_start_index: int, + org_vocab_end_index: int, num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Reference implementation for verification""" + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < + org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index) + added_offset = added_vocab_start_index - ( + org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding + valid_offset = (org_vocab_start_index * + org_vocab_mask) + 
(added_offset * added_vocab_mask) + vocab_mask = org_vocab_mask | added_vocab_mask + masked_input = vocab_mask * (input_ - valid_offset) + return masked_input, ~vocab_mask + + +DTYPES = [torch.int32] +SHAPES = [(3, 4, 5)] +DEVICES = [f"npu:{0}"] +SEEDS = [0] + + +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEEDS) +@torch.inference_mode() +def test_get_masked_input_and_mask( + shape: Tuple[int, ...], + dtype: torch.dtype, + device: str, + seed: int, +) -> None: + # Set random seed and device + torch.manual_seed(seed) + torch.set_default_device(device) + + # Generate random input tensor + input_tensor = torch.randint(0, 1000, shape, dtype=dtype) + + # Test parameters + test_case = { + "org_start": 100, + "org_end": 200, + "padding": 0, + "added_start": 300, + "added_end": 400, + } + + # Define reference function + def ref_fn(): + return get_masked_input_and_mask_ref(input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"]) + + # Define custom function + def custom_fn(): + return torch.ops._C.get_masked_input_and_mask(input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"]) + + # Get results for correctness testing + ref_masked_input, ref_mask = ref_fn() + custom_masked_input, custom_mask = custom_fn() + + # Benchmark both implementations + ref_time = benchmark_npu(ref_fn) + custom_time = benchmark_npu(custom_fn) + + # Print performance results + print("\nPerformance Results:") + print(f"Reference implementation: {ref_time*1000:.3f} ms") + print(f"Custom implementation: {custom_time*1000:.3f} ms") + print(f"Speedup: {ref_time/custom_time:.2f}x") + + # Compare results for correctness + ref_masked_input = ref_masked_input.to(dtype) + print("\nResults comparison:") + print("custom_masked_input:", custom_masked_input) + print("ref_masked_input:", ref_masked_input) + print("custom_mask:", custom_mask) + print("ref_mask:", ref_mask) + torch.testing.assert_close( + custom_masked_input, + ref_masked_input, + rtol=1e-5, + atol=1e-5, + msg=f"Masked input mismatch for case: {test_case}") + torch.testing.assert_close(custom_mask, + ref_mask, + rtol=1e-5, + atol=1e-5, + msg=f"Mask mismatch for case: {test_case}") diff --git a/csrc/kernels/get_masked_input_and_mask_kernel.cpp b/csrc/kernels/get_masked_input_and_mask_kernel.cpp new file mode 100644 index 000000000..47ce826f2 --- /dev/null +++ b/csrc/kernels/get_masked_input_and_mask_kernel.cpp @@ -0,0 +1,345 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. 
+ */ + +#include "kernel_operator.h" +#include "kernel_tensor_impl.h" +#include "kernel_type.h" +#include "types.h" +#include "utils.h" +using vllm_ascend::AccType; + +template +class GetMaskedInputAndMask { +public: + __aicore__ inline GetMaskedInputAndMask() {} + + __aicore__ inline ~GetMaskedInputAndMask() { + pipe.Reset(); + } + + + __aicore__ inline void Init( + __gm__ scalar_t* input, + __gm__ scalar_t* masked_input, + __gm__ bool* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size) + { + // Initialize basic parameters + input_ = input; + masked_input_ = masked_input; + mask_out_ = mask_out; + org_vocab_start_index_ = org_vocab_start_index; + org_vocab_end_index_ = org_vocab_end_index; + size_ = ((size + 31) / 32) * 32; + added_offset_ = added_vocab_start_index - + (org_vocab_end_index - org_vocab_start_index) - + num_org_vocab_padding; + added_vocab_start_index_ = added_vocab_start_index; + added_vocab_end_index_ = added_vocab_end_index; + + // Initialize global tensors + inputGlobal.SetGlobalBuffer(input); + maskedOutputGlobal.SetGlobalBuffer(masked_input); + maskOutGlobal.SetGlobalBuffer(mask_out); + + // Initialize queues + pipe.InitBuffer(inQueue, 1, size_ * sizeof(scalar_t)); + pipe.InitBuffer(outQueue, 1, size_ * sizeof(scalar_t)); + pipe.InitBuffer(maskQueue, 1, size_ * sizeof(bool)); + + // Initialize calculation buffers + pipe.InitBuffer(calc_buf_1, size_ * sizeof(float)); + pipe.InitBuffer(calc_buf_2, size_ * sizeof(float)); + + // Initialize result queues + pipe.InitBuffer(result_ge_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_le_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_org_mask_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_add_mask_que, BUFFER_NUM, size_ * sizeof(float)); + + // Initialize temporary buffers + pipe.InitBuffer(start_buf, size_ * sizeof(float)); + pipe.InitBuffer(end_buf, size_ * sizeof(float)); + pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); + pipe.InitBuffer(validOffset_buf, size_ * sizeof(float)); + pipe.InitBuffer(vocabMask_buf_, size_ * sizeof(int8_t)); + pipe.InitBuffer(ones_buf_, size_ * sizeof(float)); + } + + __aicore__ inline void Process() + { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() + { + AscendC::LocalTensor inputLocal = inQueue.AllocTensor(); + AscendC::DataCopy(inputLocal, inputGlobal, size_); + inQueue.EnQue(inputLocal); + } + + __aicore__ inline void CompareWithValue( + AscendC::LocalTensor& result, + const AscendC::LocalTensor& input, + const AscendC::LocalTensor& compare_value, + bool is_greater_equal) { + + AscendC::LocalTensor compute_buf = calc_buf_1.Get(); + if (is_greater_equal) { + AscendC::Max(compute_buf, input, compare_value, size_); + AscendC::Sub(compute_buf, compare_value, compute_buf, size_); + } else { + AscendC::Max(compute_buf, input, compare_value, size_); + AscendC::Sub(compute_buf, compute_buf, compare_value, size_); + } + + AscendC::Abs(compute_buf, compute_buf, size_); + AscendC::Mins(compute_buf, compute_buf, MIN_ACCURACY_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_2_FP32, size_); + AscendC::Adds(compute_buf, compute_buf, NEGATIVE_ONE_FP32, size_); + AscendC::Abs(compute_buf, 
compute_buf, size_); + + AscendC::LocalTensor compute_buf_fp16 = calc_buf_2.Get(); + AscendC::Cast(compute_buf_fp16, compute_buf, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(result, compute_buf_fp16, AscendC::RoundMode::CAST_NONE, size_); + } + + __aicore__ inline void ComputeRangeMask( + AscendC::LocalTensor& range_mask, + const AscendC::LocalTensor& input, + const float start_value, + const float end_value) { + + // Use already initialized buffers + AscendC::LocalTensor start_value_tensor = start_buf.Get(); + AscendC::LocalTensor end_value_tensor = end_buf.Get(); + + AscendC::Duplicate(start_value_tensor, start_value, size_); + AscendC::Duplicate(end_value_tensor, end_value, size_); + + AscendC::LocalTensor ge_result = result_ge_que.AllocTensor(); + AscendC::LocalTensor lt_result = result_le_que.AllocTensor(); + + CompareWithValue(ge_result, start_value_tensor, input, true); + CompareWithValue(lt_result, input, end_value_tensor, false); + + AscendC::And(range_mask, ge_result, lt_result, size_); + } + + __aicore__ inline void Compute() { + AscendC::LocalTensor inputLocal = inQueue.DeQue(); + AscendC::LocalTensor maskedLocal = outQueue.AllocTensor(); + AscendC::LocalTensor maskLocal = maskQueue.AllocTensor(); + + AscendC::LocalTensor inputFloat = inputFloat_buf.Get(); + AscendC::Cast(inputFloat, inputLocal, AscendC::RoundMode::CAST_NONE, size_); + + // Calculate mask for org_vocab range + // org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) + AscendC::LocalTensor orgVocabMask = result_org_mask_que.AllocTensor(); + ComputeRangeMask(orgVocabMask, + inputFloat, + static_cast(org_vocab_start_index_), + static_cast(org_vocab_end_index_)); + + // Calculate mask for added_vocab range + // added_vocab_mask = (input_ >= added_vocab_start_index) & (input_ < added_vocab_end_index) + AscendC::LocalTensor addedVocabMask = result_add_mask_que.AllocTensor(); + ComputeRangeMask(addedVocabMask, + inputFloat, + static_cast(added_vocab_start_index_), + static_cast(added_vocab_end_index_)); + + // Calculate validOffset + // valid_offset = (org_vocab_start_index * org_vocab_mask) + (added_offset * added_vocab_mask) + AscendC::LocalTensor validOffset = validOffset_buf.Get(); + AscendC::LocalTensor constOrgStartIndex = start_buf.Get(); + + AscendC::Duplicate(constOrgStartIndex, float(org_vocab_start_index_), size_); + + AscendC::LocalTensor orgVocabMask_fp16; + AscendC::LocalTensor orgVocabMask_fp32; + AscendC::Cast(orgVocabMask_fp16, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(orgVocabMask_fp32, orgVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Mul(validOffset, + constOrgStartIndex, + orgVocabMask_fp32, + size_); + + AscendC::LocalTensor addedOffset; + AscendC::LocalTensor addedOffsetTensor = end_buf.Get(); + AscendC::Duplicate(addedOffsetTensor, float(added_offset_), size_); + + AscendC::LocalTensor addedVocabMask_fp16; + AscendC::LocalTensor addedVocabMask_fp32; + AscendC::Cast(addedVocabMask_fp16, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(addedVocabMask_fp32, addedVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Mul(addedOffset, + addedOffsetTensor, + addedVocabMask_fp32, + size_); + + AscendC::Add(validOffset, validOffset, addedOffset, size_); + + // vocab_mask = org_vocab_mask | added_vocab_mask + AscendC::LocalTensor vocabMask = vocabMask_buf_.Get(); + + AscendC::Or(vocabMask, + orgVocabMask, + addedVocabMask, + size_); + + AscendC::Sub(inputFloat, inputFloat, 
validOffset, size_); + + // input_ = vocab_mask * (input_ - valid_offset) + AscendC::LocalTensor vocabMask_fp16; + AscendC::LocalTensor vocabMask_fp32; + AscendC::Cast(vocabMask_fp16, vocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(vocabMask_fp32, vocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::LocalTensor inputFloat_fp32; + AscendC::Mul(inputFloat, inputFloat, vocabMask_fp32, size_); + + AscendC::Cast(maskedLocal, inputFloat, AscendC::RoundMode::CAST_CEIL, size_); + outQueue.EnQue(maskedLocal); + + // ~vocab_mask + AscendC::LocalTensor ones_tensor = ones_buf_.Get(); + AscendC::Duplicate(ones_tensor, (float)1, size_); + AscendC::LocalTensor maskLocal_fp32; + + AscendC::Sub(maskLocal_fp32, + ones_tensor, + vocabMask_fp32, + size_); + + AscendC::LocalTensor maskLocal_fp16; + AscendC::Cast(maskLocal_fp16, maskLocal_fp32, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(maskLocal, maskLocal_fp16, AscendC::RoundMode::CAST_NONE, size_); + maskQueue.EnQue(maskLocal); + inQueue.FreeTensor(inputLocal); + } + + __aicore__ inline void CopyOut() + { + AscendC::LocalTensor maskedLocal = outQueue.DeQue(); + AscendC::LocalTensor maskLocal = maskQueue.DeQue(); + + AscendC::DataCopy(maskedOutputGlobal, maskedLocal, size_); + AscendC::DataCopy(maskOutGlobal, maskLocal, size_); + + outQueue.FreeTensor(maskedLocal); + maskQueue.FreeTensor(maskLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + AscendC::TPipe pipe; + AscendC::TQue inQueue; + AscendC::TQue outQueue, maskQueue; + AscendC::GlobalTensor inputGlobal, maskedOutputGlobal; + AscendC::GlobalTensor maskOutGlobal; + AscendC::TBuf calc_buf_1; + AscendC::TBuf calc_buf_2; + AscendC::TQue result_ge_que; + AscendC::TQue result_le_que; + AscendC::TQue result_org_mask_que; + AscendC::TQue result_add_mask_que; + + // Temporary buffers + AscendC::TBuf start_buf; + AscendC::TBuf end_buf; + + // Temporary buffers continued + AscendC::TBuf inputFloat_buf; + AscendC::TBuf validOffset_buf; + AscendC::TBuf vocabMask_buf_; + AscendC::TBuf ones_buf_; + + __gm__ scalar_t *input_, *masked_input_; + __gm__ bool *mask_out_; + int64_t size_; + int64_t org_vocab_start_index_, org_vocab_end_index_; + int64_t added_vocab_start_index_, added_vocab_end_index_; + int64_t added_offset_; + + static constexpr float MIN_ACCURACY_FP32 = 1.1754943508222875e-38; + static constexpr float MAX_MUL_1_FP32 = 1125899906842624; + static constexpr float MAX_MUL_2_FP32 = 67108864; + static constexpr float NEGATIVE_ONE_FP32 = -1.0f; +}; + +extern "C" __global__ __aicore__ void get_masked_input_and_mask_kernel( + __gm__ int32_t* input, + __gm__ int32_t* masked_input, + __gm__ bool* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num) +{ + { + GetMaskedInputAndMask op{}; + + for (int64_t i = AscendC::GetBlockIdx(); i < loop_cnt; i += aiv_num) { + op.Init(input + i * size/loop_cnt, + masked_input + i * size/loop_cnt, + mask_out + i * size/loop_cnt, + org_vocab_start_index, org_vocab_end_index, + num_org_vocab_padding, added_vocab_start_index, + added_vocab_end_index, size/loop_cnt); + + op.Process(); + } + } // op destructor called here +} + +namespace vllm_ascend { + +void get_masked_input_and_mask_impl( + void* stream, + void* input, + void* masked_input, + void* mask_out, + const int64_t org_vocab_start_index, + 
const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num) +{ + get_masked_input_and_mask_kernel<<>>( + static_cast(input), + static_cast(masked_input), + static_cast(mask_out), + org_vocab_start_index, + org_vocab_end_index, + num_org_vocab_padding, + added_vocab_start_index, + added_vocab_end_index, + size, + loop_cnt, + aiv_num); +} + +} // namespace vllm_ascend + diff --git a/csrc/ops.h b/csrc/ops.h index b921b2bf9..b1bc6028e 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -31,6 +31,20 @@ namespace vllm_ascend { const int headSize, const int64_t numTokens, const uint32_t loopCnt, uint32_t aivNum); + extern void get_masked_input_and_mask_impl( + void* stream, + void* input, + void* masked_input, + void* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num); + torch::Tensor weak_ref_tensor(torch::Tensor& tensor) { if (!tensor.is_privateuseone()) { throw std::runtime_error("Tensor must be on NPU device"); diff --git a/csrc/torch_binding.cpp b/csrc/torch_binding.cpp index c4154381d..001a9314d 100644 --- a/csrc/torch_binding.cpp +++ b/csrc/torch_binding.cpp @@ -99,6 +99,112 @@ std::tuple rotary_embedding(at::Tensor &positions, at::T return {query_dst, key_dst}; } +std::tuple get_masked_input_and_mask( + at::Tensor &input, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index) + /* + https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/vocab_parallel_embedding.py#L161-L198 + Embedding parallelized in the vocabulary dimension. + + Adapted from torch.nn.Embedding, note that we pad the vocabulary size to + make sure it is divisible by the number of model parallel GPUs. + + In order to support various loading methods, we ensure that LoRA-added + embeddings are always at the end of TP-sharded tensors. In other words, + we shard base embeddings and LoRA embeddings separately (both padded), + and place them in the same tensor. + In this example, we will have the original vocab size = 1010, + added vocab size = 16 and padding to 64. Therefore, the total + vocab size with padding will be 1088 (because we first pad 1010 to + 1024, add 16, and then pad to 1088). + Therefore, the tensor format looks like the following: + TP1, rank 0 (no sharding): + |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >| + corresponding token_id: | 0 | 1 | ... | 1009 | -1 | ... | -1 | 1010 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 | + + TP2, rank 0: + |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >| + corresponding token_id: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 1000 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 520 | ... | 543 | + TP2, rank 1: + |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >| + corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... 
| 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 | + Parameters: + org_vocab_start_index //base embeddings start + org_vocab_end_index //base embeddings end + num_org_vocab_padding //base embeddings padding + added_vocab_start_index //LoRA embeddings start + added_vocab_end_index //LoRA embeddings end + */ +{ + // Input validation + TORCH_CHECK(input.dim() >= 1, "input must have at least 1 dimension"); + TORCH_CHECK(org_vocab_start_index >= 0, "org_vocab_start_index must be non-negative"); + TORCH_CHECK(org_vocab_end_index >= org_vocab_start_index, "org_vocab_end_index must be greater than org_vocab_start_index"); + TORCH_CHECK(num_org_vocab_padding >= 0, "num_org_vocab_padding must be non-negative"); + TORCH_CHECK(added_vocab_start_index >= org_vocab_end_index, "added_vocab_start_index must be greater than org_vocab_end_index"); + TORCH_CHECK(added_vocab_end_index >= added_vocab_start_index, "added_vocab_end_index must be greater than added_vocab_start_index"); + + // Get total number of elements + int64_t size = input.numel(); + + // Create output tensors + at::Tensor masked_input = at::empty_like(input); + at::Tensor mask = at::empty_like(input).to(at::kBool); + + // Get data pointers + void *input_ptr = input.data_ptr(); + void *masked_input_ptr = masked_input.data_ptr(); + void *mask_ptr = mask.data_ptr(); + + // Get current stream + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + + // Get scalar type + at::ScalarType scalar_type = input.scalar_type(); + + // Create and configure OpCommand + at_npu::native::OpCommand cmd; + cmd.Name("get_masked_input_and_mask"); + cmd.SetCustomHandler([scalar_type, size, stream, + input_ptr, masked_input_ptr, mask_ptr, + org_vocab_start_index, org_vocab_end_index, + num_org_vocab_padding, added_vocab_start_index, + added_vocab_end_index]() -> int { + // Get platform info + fe::PlatFormInfos platform_infos; + int device_id = 0; + fe::PlatformInfoManager::GeInstance().GetRuntimePlatformInfosByDevice(device_id, platform_infos); + uint32_t aivNum = platform_infos.GetCoreNumByType("aiv"); + uint32_t loop_cnt = (size + aivNum - 1) / aivNum; + + // Call implementation + get_masked_input_and_mask_impl( + stream, + input_ptr, + masked_input_ptr, + mask_ptr, + org_vocab_start_index, + org_vocab_end_index, + num_org_vocab_padding, + added_vocab_start_index, + added_vocab_end_index, + size, + loop_cnt, + aivNum); + + return 0; + }); + cmd.Run(); + return {masked_input, mask}; +} + void verify_tensor(std::string const& name, at::Tensor const& t, int64_t const size_0, int64_t const size_1, c10::ScalarType const type) { @@ -194,6 +300,16 @@ TORCH_LIBRARY_EXPAND(_C, ops) " Tensor! key, int head_size," " Tensor cos_sin_cache, bool is_neox) -> (Tensor query, Tensor key)"); ops.impl("rotary_embedding", torch::kPrivateUse1, &vllm_ascend::rotary_embedding); + + ops.def( + "get_masked_input_and_mask(Tensor input, " + " int org_vocab_start_index, " + " int org_vocab_end_index, " + " int num_org_vocab_padding, " + " int added_vocab_start_index, " + " int added_vocab_end_index) -> (Tensor masked_input, Tensor mask)"); + ops.impl("get_masked_input_and_mask", torch::kPrivateUse1, &vllm_ascend::get_masked_input_and_mask); + ops.def( "advance_step_flashattn_ascendc(int num_seqs, int num_queries, int block_size," " Tensor! input_tokens, Tensor! sampled_token_ids, Tensor! 
input_positions," diff --git a/tests/ops/test_vocabparallelembedding.py b/tests/ops/test_vocabparallelembedding.py new file mode 100644 index 000000000..97d6c7059 --- /dev/null +++ b/tests/ops/test_vocabparallelembedding.py @@ -0,0 +1,91 @@ +from typing import Tuple + +import pytest +import torch +import torch_npu # noqa: F401 + +import vllm_ascend.platform # noqa: F401 + +# Test parameters +DTYPES = [torch.int32] +#SHAPES = [(100,), (5, 20), (3, 4, 5)] # Various tensor shapes +#SHAPES = [(3, 4, 8), (3, 4, 5)] # Various tensor shapes +SHAPES = [(3, 4, 3)] +DEVICES = [f"npu:{0}"] +SEEDS = [0] + + +def get_masked_input_and_mask_ref( + input_: torch.Tensor, org_vocab_start_index: int, + org_vocab_end_index: int, num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Reference implementation for verification""" + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < + org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index) + added_offset = added_vocab_start_index - ( + org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding + valid_offset = (org_vocab_start_index * + org_vocab_mask) + (added_offset * added_vocab_mask) + vocab_mask = org_vocab_mask | added_vocab_mask + masked_input = vocab_mask * (input_ - valid_offset) + return masked_input, ~vocab_mask + + +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEEDS) +@torch.inference_mode() +def test_get_masked_input_and_mask( + shape: Tuple[int, ...], + dtype: torch.dtype, + device: str, + seed: int, +) -> None: + # Set random seed + torch.manual_seed(seed) + torch.set_default_device(device) + + # Generate random input tensor + input_tensor = torch.randint(0, 1000, shape, dtype=dtype) + + # Test parameters + test_case = { + "org_start": 100, + "org_end": 200, + "padding": 0, + "added_start": 300, + "added_end": 400, + } + + # Get reference result + ref_masked_input, ref_mask = get_masked_input_and_mask_ref( + input_tensor, test_case["org_start"], test_case["org_end"], + test_case["padding"], test_case["added_start"], test_case["added_end"]) + + # Get custom op result + print("input_tensor:", input_tensor) + custom_masked_input, custom_mask = torch.ops._C.get_masked_input_and_mask( + input_tensor, test_case["org_start"], test_case["org_end"], + test_case["padding"], test_case["added_start"], test_case["added_end"]) + + ref_masked_input = ref_masked_input.to(dtype) + print("custom_masked_input:", custom_masked_input) + print("ref_masked_input:", ref_masked_input) + print("custom_mask:", custom_mask) + print("ref_mask:", ref_mask) + # Compare results + torch.testing.assert_close( + custom_masked_input, + ref_masked_input, + rtol=1e-5, + atol=1e-5, + msg=f"Masked input mismatch for case: {test_case}") + torch.testing.assert_close(custom_mask, + ref_mask, + rtol=1e-5, + atol=1e-5, + msg=f"Mask mismatch for case: {test_case}") From 9bea014982c61f22ef803adf0ed840cb4faf6080 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Thu, 12 Jun 2025 10:46:41 +0800 Subject: [PATCH 17/28] [CI][Benchmark] Add new model and v1 test to perf benchmarks (#1099) ### What this PR does / why we need it? 
- Add qwen2.5-7b-instruct test - Add v1 test --------- Signed-off-by: wangli Signed-off-by: wangxiaoxin (A) --- .github/workflows/nightly_benchmarks.yaml | 28 ++++++++++++++--------- benchmarks/tests/latency-tests.json | 10 ++++++++ benchmarks/tests/serving-tests.json | 24 +++++++++++++++++++ benchmarks/tests/throughput-tests.json | 11 +++++++++ 4 files changed, 62 insertions(+), 11 deletions(-) diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 4f00e21df..5b0d3d4a3 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -41,13 +41,18 @@ jobs: test: if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} - name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }} + name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}, use_v1=${{ matrix.vllm_use_v1 }} runs-on: 'linux-arm64-npu-static-8' strategy: matrix: include: - vllm_branch: v0.9.1 vllm_ascend_branch: main + vllm_use_v1: 0 + - vllm_branch: v0.9.0 + vllm_ascend_branch: main + vllm_use_v1: 1 + max-parallel: 1 container: image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 volumes: @@ -67,6 +72,7 @@ jobs: HF_TOKEN: ${{ secrets.HF_TOKEN }} ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }} ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }} + VLLM_USE_V1: ${{ matrix.vllm_use_v1 }} steps: - name: Check npu and CANN info run: | @@ -136,7 +142,7 @@ jobs: - name: Install elastic_tool if: github.event_name != 'pull_request' run: | - pip install escli-tool==0.2.1 + pip install escli-tool==0.2.2 - name: Collect pr info from vllm-project/vllm-ascend if: github.event_name != 'pull_request' @@ -173,17 +179,17 @@ jobs: echo "vllm branch: ${{ matrix.vllm_branch }}" echo "vllm-ascend branch: ${{ matrix.vllm_ascend_branch }}" echo "------------------------" + cd /github/home bash benchmarks/scripts/run-performance-benchmarks.sh # send the result to es - if [[ "${{ github.event_name }}" != "pull request" ]]; then - escli add --vllm_branch ${{ matrix.vllm_branch }} \ - --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \ - --commit_id $commit_id \ - --commit_title "$commit_title" \ - --created_at "$commit_time_no_tz" \ - --res_dir ./benchmarks/results - rm -rf ./benchmarks/results - fi + escli add --vllm_branch ${{ matrix.vllm_branch }} \ + --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \ + --commit_id $commit_id \ + --commit_title "$commit_title" \ + --created_at "$commit_time_no_tz" \ + --res_dir ./benchmarks/results \ + --extra_feat '{"VLLM_USE_V1": "${{ matrix.vllm_use_v1 }}"}' + rm -rf ./benchmarks/results cd - done < commit_log.txt diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json index 576ced213..40cec4c5f 100644 --- a/benchmarks/tests/latency-tests.json +++ b/benchmarks/tests/latency-tests.json @@ -9,5 +9,15 @@ "num_iters_warmup": 5, "num_iters": 15 } + }, + { + "test_name": "latency_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } } ] diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json index d8ad2be2b..c8d5cdaf9 100644 --- a/benchmarks/tests/serving-tests.json +++ 
b/benchmarks/tests/serving-tests.json @@ -49,5 +49,29 @@ "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } + }, + { + "test_name": "serving_qwen2_5_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } } ] diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json index 551d23882..3698e69f3 100644 --- a/benchmarks/tests/throughput-tests.json +++ b/benchmarks/tests/throughput-tests.json @@ -22,6 +22,17 @@ "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", "num_prompts": 200 } + }, + { + "test_name": "throughput_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } } ] From d6aacdff0b776b233164872ee6bb5aedd0aa9d69 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Thu, 12 Jun 2025 10:47:30 +0800 Subject: [PATCH 18/28] [CI][Benchmark] Add qwen2.5-7b test (#1104) ### What this PR does / why we need it? - Add qwen2.5-7b performance benchmark, this is a sub pr of #1099, for v1 test, need more verify - Fix get commit time after checkout --------- Signed-off-by: wangli Signed-off-by: wangxiaoxin (A) --- .github/workflows/nightly_benchmarks.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 5b0d3d4a3..116912111 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -166,10 +166,10 @@ jobs: while IFS= read -r line || [[ -n "$line" ]]; do commit_id=${line%% *} commit_title=${line#* } - commit_time=$(git show -s --format=%cd $commit_hash --date=iso-strict) - commit_time_no_tz=${commit_time::19} git checkout $commit_id + commit_time=$(git show -s --format=%cd $commit_hash --date=iso-strict) + commit_time_no_tz=${commit_time::19} pip install -e . echo "------------------------" From e0ef036b775f9d068e1a90c40b38a5e169d4d22e Mon Sep 17 00:00:00 2001 From: wangyanhui-cmss Date: Thu, 12 Jun 2025 19:40:58 +0800 Subject: [PATCH 19/28] [fix] fix bug in 1p1d disaggregated_prefill example (#1184) ### What this PR does / why we need it? fix bug in 1p1d disaggregated_prefill example ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? 
Tested with python find_device_ips.py and run disaggregated_prefill example Signed-off-by: wangyanhui-cmss Signed-off-by: wangxiaoxin (A) --- .../disaggregated_prefill/find_device_ips.py | 58 ++++++++++--------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/examples/disaggregated_prefill/find_device_ips.py b/examples/disaggregated_prefill/find_device_ips.py index 205afbf78..48fd7b9d3 100644 --- a/examples/disaggregated_prefill/find_device_ips.py +++ b/examples/disaggregated_prefill/find_device_ips.py @@ -30,38 +30,40 @@ HCCN_TOOL_PATH = envs.HCCN_PATH -def get_device_ips(world_size: int): - npu_info = subprocess.run( - ["npu-smi", "info", "-m"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - ) +def get_device_ips(): + npu_info = subprocess.run(['npu-smi', 'info', '-m'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) if npu_info.returncode != 0 or not os.path.exists(HCCN_TOOL_PATH): raise RuntimeError("No npu-smi/hccn_tool tools provided for NPU.") - npu_start_idx = int( - re.match(r".*\n\t([0-9]+).*", - npu_info.stdout).group(1)) # type: ignore + + # ‌Extract NPU IDs for all Ascend devices (excluding Mcu rows) + device_ids = [] + for line in npu_info.stdout.strip().split('\n'): + match = re.match(r'^\s*(\d+)\s+\d+\s+\d+\s+Ascend', line) + if match: + device_ids.append(int(match.group(1))) + + if not device_ids: + raise RuntimeError( + "Cannot parse any valid device ID from npu-smi output.") + device_ip_list = [] - for ip_offset in range(world_size): - cmd = [ - HCCN_TOOL_PATH, - "-i", - f"{npu_start_idx + ip_offset}", - "-ip", - "-g", - ] - device_ip_info = subprocess.run( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - ) - device_ip = re.match(r"ipaddr:(.*)\n", - device_ip_info.stdout).group(1) # type: ignore + for device_id in device_ids: + cmd = [HCCN_TOOL_PATH, '-i', str(device_id), '-ip', '-g'] + device_ip_info = subprocess.run(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) + ip_match = re.search(r'ipaddr:(.*)', device_ip_info.stdout) + if not ip_match: + raise RuntimeError( + f"Cannot parse IP from hccn_tool for device {device_id}") + device_ip = ip_match.group(1).strip() device_ip_list.append(device_ip) + return device_ip_list -# Pass number of NPUs into this function. -print(get_device_ips(8)) +print(get_device_ips()) From f0ee1c3e066efe2946fbc09490f65f402b0881a0 Mon Sep 17 00:00:00 2001 From: Wan_Danfeng Date: Thu, 12 Jun 2025 21:22:23 +0800 Subject: [PATCH 20/28] [Doc] Add Referer header for CANN package download url. (#1192) ### What this PR does / why we need it? fix the CANN download url ### Does this PR introduce _any_ user-facing change? no, do not have any user-facing change ### How was this patch tested? run the **wget** command and cann package is rightly downloaded. --------- Signed-off-by: wan_danfeng Signed-off-by: wangxiaoxin (A) --- docs/source/installation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/installation.md b/docs/source/installation.md index 8fbe1825d..c9f684491 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -78,17 +78,17 @@ source vllm-ascend-env/bin/activate pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple attrs 'numpy<2.0.0' decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions # Download and install the CANN package. 
-wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run chmod +x ./Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run ./Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run --full source /usr/local/Ascend/ascend-toolkit/set_env.sh -wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run chmod +x ./Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run ./Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run --install -wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run chmod +x ./Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run ./Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run --install From 634325c83dce2f1befc2152f749eabcfaaf94941 Mon Sep 17 00:00:00 2001 From: sdmyzlp <117554856+sdmyzlp@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:42:09 +0800 Subject: [PATCH 21/28] Support multistream of MLA vector operations (#1135) ### What this PR does / why we need it? Move all vector operations to a secondary stream, with the expected overlaping being: ``` | q_rmsnorm | | kv_norm_rope_cache | | q_rope | | matmul W_DQ | matmul W_DKV | index | index | matmul W_UQ | split | matmul W_KV_T | ``` Currently, the `IndexByTensor` operators introduced by computation of `cos` and `sin` can't be offloaded to the secondary stream due to a known bug of graph fusion optimization pass. So we instead keep it in the main stream, only requires it be computed before `matmul W_UQ` to avoid hindering later overlapping. The problem may be solved by later optimization (#993), which hoists the computation of `cos` and `sin` up to the first layer. ### Does this PR introduce _any_ user-facing change? Controlled by `torchair_graph_config.enable_multistream_mla`, defaulted to False. ### How was this patch tested? Tested on 1x16 910 node, with tailored 2 layer DSKv2. 
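For illustration, a minimal usage sketch of the new switch (the model name and the accompanying `ascend_scheduler_config` entry are assumptions for the example, not part of this patch; only the `enable_multistream_mla` key is introduced here):

```python
import os

from vllm import LLM

os.environ["VLLM_USE_V1"] = "1"

# Sketch: enable the secondary-stream MLA path on top of torchair graph mode.
# The flag defaults to False when omitted.
llm = LLM(
    model="deepseek-ai/DeepSeek-R1-0528",  # illustrative model choice
    additional_config={
        "torchair_graph_config": {
            "enabled": True,
            "enable_multistream_mla": True,
        },
        "ascend_scheduler_config": {"enabled": True},
    },
)
outputs = llm.generate("Hello, how are you?")
```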
Signed-off-by: sdmyzlp Signed-off-by: wangxiaoxin (A) --- docs/source/user_guide/additional_config.md | 1 + tests/singlecard/test_ascend_config.py | 2 + vllm_ascend/ascend_config.py | 2 + vllm_ascend/attention/mla_v1.py | 56 ++++++++++++++------- vllm_ascend/models/deepseek_v2.py | 14 +++++- 5 files changed, 56 insertions(+), 19 deletions(-) diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index b1041bf17..46adac332 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -40,6 +40,7 @@ The details of each config option are as follows: | Name | Type | Default | Description | | ---- | ---- | ------- | ----------- | | `enabled` | bool | `False` | Whether to enable torchair graph mode | +| `enable_multistream_mla`| bool | `False` | Whether to put vector ops of MLA to another stream | | `enable_multistream_moe`| bool | `False` | Whether to enable multistream shared expert | | `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization | | `use_cached_graph` | bool | `False` | Whether to use cached graph | diff --git a/tests/singlecard/test_ascend_config.py b/tests/singlecard/test_ascend_config.py index 818745f30..63484d4a0 100644 --- a/tests/singlecard/test_ascend_config.py +++ b/tests/singlecard/test_ascend_config.py @@ -59,6 +59,7 @@ def test_run_with_ascend_config(): "graph_batch_sizes": [1, 2, 4, 8], "graph_batch_sizes_init": False, "enable_multistream_moe": True, + "enable_multistream_mla": True, }, "ascend_scheduler_config": { "enabled": True, @@ -79,6 +80,7 @@ def test_run_with_ascend_config(): 1, 2, 4, 8 ] assert not ascend_config.torchair_graph_config.graph_batch_sizes_init + assert ascend_config.torchair_graph_config.enable_multistream_mla assert ascend_config.torchair_graph_config.enable_multistream_moe assert ascend_config.ascend_scheduler_config.enabled assert ascend_config.ascend_scheduler_config.enable_chunked_prefill diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 67c9842df..defa7fd3d 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -56,6 +56,8 @@ def __init__(self, torchair_graph_config): "graph_batch_sizes", []) self.graph_batch_sizes_init = torchair_graph_config.get( "graph_batch_sizes_init", False) + self.enable_multistream_mla = torchair_graph_config.get( + "enable_multistream_mla", False) self.enable_multistream_moe = torchair_graph_config.get( "enable_multistream_moe", False) self.enable_view_optimize = torchair_graph_config.get( diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index b17d65c04..43cb71c68 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -20,6 +20,7 @@ from vllm_ascend.multistream.context import get_multistream_comm_context from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla +from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -557,6 +558,9 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz + self.enable_multistream_mla = \ + ascend_config.torchair_graph_config.enable_multistream_mla + # Adapt torch air graph mode with spec decoding. 
speculative_config = get_current_vllm_config().speculative_config if speculative_config is not None: @@ -861,17 +865,20 @@ def exec_kv( # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) cache_mode = "PA_NZ" if self.enable_kv_nz else "PA" - k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( - kv, - self.kv_a_layernorm.weight, - cos, - sin, - slots.to(torch.int64), - kv_cache[1], - kv_cache[0], - epsilon=self.kv_a_layernorm.variance_epsilon, - cache_mode=cache_mode, - ) + with npu_stream_switch("mla_secondary", + 0, + enabled=self.enable_multistream_mla): + k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( + kv, + self.kv_a_layernorm.weight, + cos, + sin, + slots.to(torch.int64), + kv_cache[1], + kv_cache[0], + epsilon=self.kv_a_layernorm.variance_epsilon, + cache_mode=cache_mode, + ) return k_pe, k_nope def exec_kv_prefill( @@ -1064,23 +1071,38 @@ def forward( if has_decode: decode_k_nope = None assert attn_metadata.decode is not None - decode_ql_nope, decode_q_pe = \ - self._q_proj_and_k_up_proj(decode_hs_or_q_c) if self.running_in_graph: seq_len = self.rotary_emb.max_position_embeddings cos = self.rotary_emb.cos_cached[:seq_len].to( - dtype=decode_q_pe.dtype) + dtype=decode_hs_or_q_c.dtype) sin = self.rotary_emb.sin_cached[:seq_len].to( - dtype=decode_q_pe.dtype) + dtype=decode_hs_or_q_c.dtype) cos = cos[attn_metadata.decode.input_positions] sin = sin[attn_metadata.decode.input_positions] cos = cos[:, None, None, :] sin = sin[:, None, None, :] - - decode_q_pe = self.rope_single(decode_q_pe, cos, sin) + # Without explicitly controlling the order, IndexByTensor operations + # would be placed after `matmul W_KV_T` hindering the overlapping of + # KvRmsNormRopeCache and SingleRope. + npu_wait_tensor(decode_hs_or_q_c, + cos, + enabled=self.enable_multistream_mla) + npu_wait_tensor(decode_hs_or_q_c, + sin, + enabled=self.enable_multistream_mla) + decode_ql_nope, decode_q_pe = \ + self._q_proj_and_k_up_proj(decode_hs_or_q_c) + if self.running_in_graph: decode_k_pe, decode_k_nope = self.exec_kv( hidden_states_or_kv_c_normed, cos, sin, kv_cache, attn_metadata.slot_mapping) + with npu_stream_switch("mla_secondary", + 0, + enabled=self.enable_multistream_mla): + npu_wait_tensor(decode_q_pe, + decode_k_pe, + enabled=self.enable_multistream_mla) + decode_q_pe = self.rope_single(decode_q_pe, cos, sin) else: decode_q_pe[...], decode_k_pe[...] 
= self.rotary_emb( attn_metadata.decode.input_positions, diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index a83ca4751..0ae1142a3 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -71,7 +71,8 @@ from vllm_ascend.ops.fused_moe import AscendFusedMoE from vllm_ascend.quantization.quant_config import AscendLinearMethod from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod -from vllm_ascend.utils import dispose_tensor +from vllm_ascend.utils import (dispose_tensor, npu_stream_switch, + npu_wait_tensor) VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 @@ -496,6 +497,8 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_multistream_mla = \ + ascend_config.torchair_graph_config.enable_multistream_mla def forward( self, @@ -505,7 +508,14 @@ def forward( attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor: if self.q_lora_rank is not None: ckq = self.q_a_proj(hidden_states)[0] - hidden_states_or_q_c = self.q_a_layernorm(ckq) + use_multistream_mla = (self.enable_multistream_mla + and attn_metadata is not None + and attn_metadata.num_decodes > 0) + npu_wait_tensor(hidden_states, ckq, enabled=use_multistream_mla) + with npu_stream_switch("mla_secondary", + 0, + enabled=use_multistream_mla): + hidden_states_or_q_c = self.q_a_layernorm(ckq) else: hidden_states_or_q_c = hidden_states if self.torchair_graph_enabled: From cd55d0cb8dbd005fb19b15846139475bc7d173ec Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Fri, 13 Jun 2025 07:51:23 +0800 Subject: [PATCH 22/28] [CI] Recover ut for ascend scheduler only in ci of v1. (#1180) Last PR [#943 ](https://github.com/vllm-project/vllm-ascend/pull/943) wrongly open ut of AscendScheduler in V0 ci, this PR fixes this problem and only run ut of it in V1 ci. Signed-off-by: whx-sjtu <2952154980@qq.com> Signed-off-by: wangxiaoxin (A) --- .github/workflows/vllm_ascend_test.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 54a3249e4..c474fc8e7 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -183,8 +183,8 @@ jobs: # test_ascend_config.py should be ran separately because it will regenerate the global config many times. pytest -sv tests/singlecard/test_ascend_config.py pytest -sv tests/singlecard/test_camem.py - # pytest -sv tests/singlecard/core/test_ascend_scheduler.py - # pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py + pytest -sv tests/singlecard/core/test_ascend_scheduler.py + pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ --ignore=tests/singlecard/test_guided_decoding.py \ @@ -215,7 +215,6 @@ jobs: # test_ascend_config.py should be ran separately because it will regenerate the global config many times. 
pytest -sv tests/singlecard/test_ascend_config.py pytest -sv tests/singlecard/test_prompt_embedding.py - pytest -sv tests/singlecard/core/test_ascend_scheduler.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ --ignore=tests/singlecard/test_guided_decoding.py \ From a4294bb0fd51c9392cb5c94c24a94e64ba9f17c9 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Fri, 13 Jun 2025 18:25:50 +0800 Subject: [PATCH 23/28] Add ShouJian Zheng (@jianzs) as vLLM Ascend maintainer (#1203) ### What this PR does / why we need it? Add @jianzs as vLLM Ascend maintainer @jianzs ---- I would like to nominate Shoujian Zheng (@jianzs ) as a maintainer, starting with my +1. - He focuses on the code quality and good design with solid reviews in P/D disaggregation and DeepSeek improvement area about 30+ high quality review, such as #issuecomment-2811764833, #discussion_r2069927605 and #pullrequestreview-2820996674. This is the most important reason why I nominated him, because helping community developers complete PRs with high quality and continuously ensure the quality of codebase is one of the important responsibilities of a maintainer. We believe he is a great addition. - Shoujian's main expertise is distributed inference. He has a lot of experience in production about AI infra. He has very good habits and explains in great detail all changes #issue-3023082580 anqd share results open: #issuecomment-2853140443. And High quality PR: #706, #774, #852. - Community Involvement: Active involved in community discussion, he is collaborative and helps the users solve problems, involved in 30+ PR and issue, such as #issuecomment-2911934292 and #issuecomment-2833523571. Reference: [1] https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html [2] https://vllm-ascend.readthedocs.io/en/latest/community/governance.html Signed-off-by: Yikun Jiang Signed-off-by: wangxiaoxin (A) --- docs/source/community/contributors.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/community/contributors.md b/docs/source/community/contributors.md index d61259de8..16ea600d6 100644 --- a/docs/source/community/contributors.md +++ b/docs/source/community/contributors.md @@ -7,6 +7,7 @@ | Xiyuan Wang| [@wangxiyuan](https://github.com/wangxiyuan) | 2025/01 | | Yikun Jiang| [@Yikun](https://github.com/Yikun) | 2025/02 | | Yi Gan| [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/02 | +| Shoujian Zheng| [@jianzs](https://github.com/jianzs) | 2025/06 | ## Contributors From ca0ab3603f462f4d2dfeda7af99af7f50f8a66bb Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Sat, 14 Jun 2025 16:59:00 +0800 Subject: [PATCH 24/28] [CI/UT][Graph] Add ut for torchair graph mode (#1103) ### What this PR does / why we need it? Add ut for torchair graph mode on DeepSeekV3 ### How was this patch tested? CI passed with new added test. 
--------- Signed-off-by: MengqingCao Signed-off-by: Mengqing Cao Signed-off-by: wangxiaoxin (A) --- docs/source/user_guide/additional_config.md | 18 ++--- docs/source/user_guide/graph_mode.md | 5 +- tests/conftest.py | 9 ++- tests/multicard/test_torchair_graph_mode.py | 80 +++++++++++++++++++++ 4 files changed, 100 insertions(+), 12 deletions(-) create mode 100644 tests/multicard/test_torchair_graph_mode.py diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index 46adac332..d4756ef5e 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -54,7 +54,7 @@ The details of each config option are as follows: | ---- | ---- | ------- | ----------- | | `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine| -ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `chunked_prefill_enabled: true` to ascend_scheduler_config as well. +ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well. ### Example @@ -63,18 +63,18 @@ A full example of additional configuration is as follows: ``` { "torchair_graph_config": { - "enabled": true, - "use_cached_graph": true, + "enabled": True, + "use_cached_graph": True, "graph_batch_sizes": [1, 2, 4, 8], - "graph_batch_sizes_init": false, - "enable_multistream_moe": false, - "enable_kv_nz": false + "graph_batch_sizes_init": False, + "enable_multistream_moe": False, + "enable_kv_nz": False }, "ascend_scheduler_config": { - "enabled": true, - "chunked_prefill_enabled": true, + "enabled": True, + "enable_chunked_prefill": True, }, "expert_tensor_parallel_size": 1, - "refresh": false, + "refresh": False, } ``` diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md index 6831809f2..161b89a0d 100644 --- a/docs/source/user_guide/graph_mode.md +++ b/docs/source/user_guide/graph_mode.md @@ -47,14 +47,15 @@ from vllm import LLM os.environ["VLLM_USE_V1"] = 1 -model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True}}) +# TorchAirGraph is only work without chunked-prefill now +model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}}) outputs = model.generate("Hello, how are you?") ``` online example: ```shell -vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enabled": true}}' +vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}}' ``` You can find more detail about additional config [here](./additional_config.md) diff --git a/tests/conftest.py b/tests/conftest.py index 16bbc8027..e0d70a19d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,7 @@ # Adapted from vllm-project/vllm/blob/main/tests/conftest.py # +import contextlib import gc from typing import List, Optional, Tuple, TypeVar, Union @@ -53,11 +54,17 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray] -def cleanup_dist_env_and_memory(): +def cleanup_dist_env_and_memory(shutdown_ray: bool = False): destroy_model_parallel() 
destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + if shutdown_ray: + import ray # Lazy import Ray + ray.shutdown() gc.collect() torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() class VllmRunner: diff --git a/tests/multicard/test_torchair_graph_mode.py b/tests/multicard/test_torchair_graph_mode.py new file mode 100644 index 000000000..d06ec7de2 --- /dev/null +++ b/tests/multicard/test_torchair_graph_mode.py @@ -0,0 +1,80 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +"""Compare the short outputs of HF and vLLM when using greedy sampling. + +Run `pytest tests/multicard/test_torchair_graph_mode.py`. +""" +import os + +import pytest + +from tests.conftest import VllmRunner + +os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" + + +@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", + reason="torchair graph is not supported on v0") +def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + + example_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + dtype = "half" + max_tokens = 5 + # torchair is only work without chunked-prefill now + with VllmRunner( + "vllm-ascend/DeepSeek-V3-Pruning", + dtype=dtype, + tensor_parallel_size=4, + distributed_executor_backend="mp", + additional_config={ + "torchair_graph_config": { + "enabled": True, + }, + "ascend_scheduler_config": { + "enabled": True, + }, + "refresh": True, + }, + enforce_eager=False, + ) as vllm_model: + # use greedy sampler to make sure the generated results are fix + vllm_output = vllm_model.generate_greedy(example_prompts, + max_tokens) + # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random weight of + # DeepSeek-V3 with 2 hidden layers, thus the golden results seems + # inaccurate. This will only change if accuracy improves with the + # official weights of DeepSeek-V3. + golden_results = [ + 'Hello, my name is feasibility伸 spazio debtor添', + 'The president of the United States is begg"""\n杭州风和 bestimm', + 'The capital of France is frequentlyশามalinkAllowed', + 'The future of AI is deleting俯احت怎么样了حراف', + ] + + assert len(golden_results) == len(vllm_output) + for i in range(len(vllm_output)): + assert golden_results[i] == vllm_output[i][1] + print(f"Generated text: {vllm_output[i][1]!r}") From fa22e88212941d9598afae4ab760eb4cddce9db7 Mon Sep 17 00:00:00 2001 From: 22dimensions Date: Sun, 15 Jun 2025 15:41:11 +0800 Subject: [PATCH 25/28] [Doc] fix VLLM_USE_V1 value in graph mode docs (#1226) os.environ["VLLM_USE_V1"] must be assigned with str, not other type. 
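
A small illustration (not part of the diff) of why the docs needed this fix — `os.environ` only accepts string values, so assigning an int fails at runtime:

```python
import os

try:
    os.environ["VLLM_USE_V1"] = 1  # wrong: os.environ values must be str
except TypeError as err:
    print(f"assignment rejected: {err}")

os.environ["VLLM_USE_V1"] = "1"  # correct
```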
![image](https://github.com/user-attachments/assets/9d337ae5-00e5-4179-832e-c6c917dd5798) Signed-off-by: 22dimensions Signed-off-by: wangxiaoxin (A) --- docs/source/user_guide/graph_mode.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md index 161b89a0d..38d356249 100644 --- a/docs/source/user_guide/graph_mode.md +++ b/docs/source/user_guide/graph_mode.md @@ -23,7 +23,7 @@ import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" model = LLM(model="Qwen/Qwen2-7B-Instruct") outputs = model.generate("Hello, how are you?") @@ -45,7 +45,7 @@ offline example: import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" # TorchAirGraph is only work without chunked-prefill now model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}}) @@ -70,7 +70,7 @@ offline example: import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" model = LLM(model="someother_model_weight", enforce_eager=True) outputs = model.generate("Hello, how are you?") From 45e802ce08c4a85d37e3356f6d9edf10e510834e Mon Sep 17 00:00:00 2001 From: zhuo97 <49392868+zhuo97@users.noreply.github.com> Date: Mon, 16 Jun 2025 21:03:16 +0800 Subject: [PATCH 26/28] Fix the device error when using ray as vllm-acend backend (#884) 1. Remove RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES 2. Add lazy init for vllm_ascend_C Signed-off-by: zhuo97 <1103045176@qq.com> Signed-off-by: wangxiaoxin (A) --- examples/offline_multi_step_custom_ops.py | 3 --- tests/singlecard/ops/test_rotary_embedding.py | 4 ++- vllm_ascend/attention/attention.py | 4 +-- vllm_ascend/ops/rotary_embedding.py | 5 ++-- vllm_ascend/platform.py | 12 --------- vllm_ascend/utils.py | 27 +++++++++++++++++++ vllm_ascend/worker/worker_v1.py | 5 ++++ 7 files changed, 40 insertions(+), 20 deletions(-) diff --git a/examples/offline_multi_step_custom_ops.py b/examples/offline_multi_step_custom_ops.py index 82a1bf575..59c7fafcc 100644 --- a/examples/offline_multi_step_custom_ops.py +++ b/examples/offline_multi_step_custom_ops.py @@ -19,9 +19,6 @@ from vllm import LLM, SamplingParams -import vllm_ascend.platform as pf - -pf.CUSTOM_OP_ENABLED = True # set True for custom Ops of Multi-Step. 
prompts = [ "Hello, my name is", "The president of the United States is", diff --git a/tests/singlecard/ops/test_rotary_embedding.py b/tests/singlecard/ops/test_rotary_embedding.py index 2d5ec18da..a3504a88b 100644 --- a/tests/singlecard/ops/test_rotary_embedding.py +++ b/tests/singlecard/ops/test_rotary_embedding.py @@ -10,7 +10,9 @@ import torch import torch.nn as nn -import vllm_ascend.platform # noqa: F401 +from vllm_ascend.utils import enable_custom_op + +enable_custom_op() # Only Neox style true scenario is supported for now IS_NEOX_STYLE = [True] diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py index a567cc530..9c112ed62 100644 --- a/vllm_ascend/attention/attention.py +++ b/vllm_ascend/attention/attention.py @@ -36,7 +36,7 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ops.cache import concat_and_cache_mla -from vllm_ascend.platform import CUSTOM_OP_ENABLED +from vllm_ascend.utils import enable_custom_op from vllm_ascend.worker.model_runner import ( ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata) @@ -462,7 +462,7 @@ def advance_step(self, for i in range(num_queries): self.seq_lens[i] += 1 self.max_decode_seq_len = max(self.seq_lens) - if CUSTOM_OP_ENABLED: + if enable_custom_op(): #advance a step on NPU for existing inputs for a multi-step runner if custom ops is enabled torch.ops._C.advance_step_flashattn_ascendc( num_seqs=num_seqs, diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 0c2a00afb..9f8ae784c 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -22,11 +22,12 @@ from vllm.model_executor.layers.rotary_embedding import ( DeepseekScalingRotaryEmbedding, RotaryEmbedding) -from vllm_ascend.platform import CUSTOM_OP_ENABLED +from vllm_ascend.utils import enable_custom_op def custom_rotary_embedding_enabled(query, neox_style, head_size): - return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and CUSTOM_OP_ENABLED + return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and enable_custom_op( + ) def rope_forward_oot( diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 912e375f7..80211ad13 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -16,7 +16,6 @@ # import gc -import logging import os from datetime import timedelta from typing import TYPE_CHECKING, Optional, Tuple @@ -32,16 +31,6 @@ from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes -CUSTOM_OP_ENABLED = False -try: - # register custom ops into torch_library here - import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401 - CUSTOM_OP_ENABLED = True -except ImportError as e: - logging.warning( - "Failed to import 'vllm_ascend.vllm_ascend_C': %s. All custom ops will be disabled. 
", - e) - if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig from vllm.utils import FlexibleArgumentParser @@ -50,7 +39,6 @@ VllmConfig = None FlexibleArgumentParser = None -os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1" os.environ["ACL_OP_INIT_MODE"] = ascend_envs.VLLM_ASCEND_ACL_OP_INIT_MODE diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index f41dab4b9..6c2c6c608 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -53,6 +53,8 @@ ASCEND_QUATIZATION_METHOD = "ascend" +CUSTOM_OP_ENABLED = None + def try_register_lib(lib_name: str, lib_info: str = ""): import importlib @@ -67,6 +69,31 @@ def try_register_lib(lib_name: str, lib_info: str = ""): pass +def enable_custom_op(): + """ + Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component. + Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device(). + """ + global CUSTOM_OP_ENABLED + + if CUSTOM_OP_ENABLED is not None: + return CUSTOM_OP_ENABLED + + else: + try: + # register custom ops into torch_library here + import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401 + CUSTOM_OP_ENABLED = True + + except ImportError: + CUSTOM_OP_ENABLED = False + logger.warning( + "Warning: Failed to register custom ops, all custom ops will be disabled" + ) + + return CUSTOM_OP_ENABLED + + def find_hccl_library() -> str: """ We either use the library file specified by the `HCCL_SO_PATH` diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index ba29dcfc5..ebdf01e15 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -117,6 +117,11 @@ def wake_up(self, tags: Optional[list[str]] = None) -> None: allocator = CaMemAllocator.get_instance() allocator.wake_up(tags=tags) + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + def init_device(self): if self.device_config.device.type == "npu": self.device = torch.device(f"npu:{self.local_rank_across_dp}") From 94d0f0782ba5699c2335a992c5f4b8f291f8d0cf Mon Sep 17 00:00:00 2001 From: "wangxiaoxin (A)" Date: Tue, 17 Jun 2025 17:22:09 +0800 Subject: [PATCH 27/28] remove main vll verison. 
Signed-off-by: wangxiaoxin (A) --- .github/workflows/vllm_ascend_test.yaml | 2 +- .github/workflows/vllm_ascend_test_long_term.yaml | 2 +- .github/workflows/vllm_ascend_test_pd.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index c474fc8e7..9245820ff 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -113,7 +113,7 @@ jobs: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [main, v0.9.1] + vllm_version: [v0.9.1] concurrency: group: > ${{ diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index 4b5253c72..7ac0a825b 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -40,7 +40,7 @@ jobs: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [main, v0.9.1] + vllm_version: [v0.9.1] name: vLLM Ascend long term test runs-on: ${{ matrix.os }} container: diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml index 84800c7e3..e44d60704 100644 --- a/.github/workflows/vllm_ascend_test_pd.yaml +++ b/.github/workflows/vllm_ascend_test_pd.yaml @@ -38,7 +38,7 @@ jobs: if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }} strategy: matrix: - vllm_verison: [main, v0.9.1] + vllm_verison: [v0.9.1] name: vLLM Ascend prefilling decoding disaggregation test runs-on: linux-arm64-npu-static-8 From f9491b873167f138a731a672d9b2df498bace713 Mon Sep 17 00:00:00 2001 From: "wangxiaoxin (A)" Date: Tue, 17 Jun 2025 17:26:36 +0800 Subject: [PATCH 28/28] Revert "Add ShouJian Zheng (@jianzs) as vLLM Ascend maintainer (#1203)" This reverts commit 70864b6759106c75ccad5fb7324203a214985855. Signed-off-by: wangxiaoxin (A) --- docs/source/community/contributors.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/community/contributors.md b/docs/source/community/contributors.md index 16ea600d6..d61259de8 100644 --- a/docs/source/community/contributors.md +++ b/docs/source/community/contributors.md @@ -7,7 +7,6 @@ | Xiyuan Wang| [@wangxiyuan](https://github.com/wangxiyuan) | 2025/01 | | Yikun Jiang| [@Yikun](https://github.com/Yikun) | 2025/02 | | Yi Gan| [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/02 | -| Shoujian Zheng| [@jianzs](https://github.com/jianzs) | 2025/06 | ## Contributors