From 5cd5d64242ab04ec2014c1a4dc726168a2ca4d6d Mon Sep 17 00:00:00 2001
From: 22dimensions
Date: Tue, 10 Jun 2025 10:07:36 +0800
Subject: [PATCH 001/153] [CI] remove old quantization model (#1003)

Remove the old quantization model; new models will be added to the test
cases later.

Signed-off-by: 22dimensions
---
 tests/conftest.py                          |  2 ++
 tests/singlecard/test_offline_inference.py | 27 +++++++++++++++++++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 7b060f3a07..16bbc8027d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -78,6 +78,7 @@ def __init__(
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = True,
+        quantization: Optional[str] = None,
         **kwargs,
     ) -> None:
         self.model = LLM(
@@ -94,6 +95,7 @@ def __init__(
             max_model_len=max_model_len,
             block_size=block_size,
             enable_chunked_prefill=enable_chunked_prefill,
+            quantization=quantization,
             **kwargs,
         )

diff --git a/tests/singlecard/test_offline_inference.py b/tests/singlecard/test_offline_inference.py
index a65451d4ff..5b58df7928 100644
--- a/tests/singlecard/test_offline_inference.py
+++ b/tests/singlecard/test_offline_inference.py
@@ -25,6 +25,7 @@

 import pytest
 import vllm  # noqa: F401
+from modelscope import snapshot_download  # type: ignore[import-untyped]
 from vllm import SamplingParams
 from vllm.assets.image import ImageAsset

@@ -33,10 +34,13 @@

 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
-    "vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8",
     "Qwen/Qwen3-0.6B-Base",
 ]
 MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"]
+
+QUANTIZATION_MODELS = [
+    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new",
+]
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"


@@ -59,6 +63,27 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
         vllm_model.generate_greedy(example_prompts, max_tokens)


+@pytest.mark.parametrize("model", QUANTIZATION_MODELS)
+@pytest.mark.parametrize("max_tokens", [5])
+def test_quantization_models(model: str, max_tokens: int) -> None:
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    # NOTE: Using the quantized model repo id from ModelScope directly hits an
+    # issue; this PR (https://github.com/vllm-project/vllm/pull/19212) fixes it.
+    # Once it is merged, the explicit model download below is no longer needed.
+    model_path = snapshot_download(model)
+
+    with VllmRunner(model_path,
+                    max_model_len=8192,
+                    enforce_eager=True,
+                    dtype="auto",
+                    gpu_memory_utilization=0.7,
+                    quantization="ascend") as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+
+
 @pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
 def test_multimodal(model, prompt_template, vllm_runner):
     image = ImageAsset("cherry_blossom") \

From 71aee6f97d20fe85bc28a486e9a2061e8fe73a7c Mon Sep 17 00:00:00 2001
From: Yikun Jiang
Date: Tue, 10 Jun 2025 13:29:09 +0800
Subject: [PATCH 002/153] Update 0.9.0rc1 contributors info (#1148)

### What this PR does / why we need it?
Update 0.9.0rc1 contributors info

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed Signed-off-by: Yikun Jiang --- docs/source/community/contributors.md | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/docs/source/community/contributors.md b/docs/source/community/contributors.md index efbea66572..d61259de84 100644 --- a/docs/source/community/contributors.md +++ b/docs/source/community/contributors.md @@ -1,4 +1,6 @@ -# Maintainers +# Maintainers and contributors + +## Maintainers | Name | Github ID | Date | |:-----------:|:-----:|:-----:| @@ -6,13 +8,30 @@ | Yikun Jiang| [@Yikun](https://github.com/Yikun) | 2025/02 | | Yi Gan| [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/02 | -# Contributors +## Contributors vLLM Ascend every release would not have been possible without the following contributors: +Updated on 2025-06-10: + | Number | Contributor | Date | Commit ID | |:------:|:-----------:|:-----:|:---------:| -| 51 | [@YisongJiang](https://github.com/YisongJiang) | 2025/5/29 | [90afaf6](https://github.com/vllm-project/vllm-ascend/commit/90afaf6306f680307462becf3c78585737579851) | +| 66 | [@Yuxiao-Xu](https://github.com/Yuxiao-Xu) | 2025/6/9 | [6b853f1](https://github.com/vllm-project/vllm-ascend/commit/6b853f15fe69ba335d2745ebcf14a164d0bcc505) | +| 65 | [@ChenTaoyu-SJTU](https://github.com/ChenTaoyu-SJTU) | 2025/6/7 | [20dedba](https://github.com/vllm-project/vllm-ascend/commit/20dedba5d1fc84b7ae8b49f9ce3e3649389e2193) | +| 64 | [@zxdukki](https://github.com/zxdukki) | 2025/6/7 | [87ebaef](https://github.com/vllm-project/vllm-ascend/commit/87ebaef4e4e519988f27a6aa378f614642202ecf) | +| 63 | [@sdmyzlp](https://github.com/sdmyzlp) | 2025/6/7 | [3640c60](https://github.com/vllm-project/vllm-ascend/commit/3640c60b0eb4d4cb104e20bfa406d3f1d17920a7) | +| 62 | [@weijinqian0](https://github.com/weijinqian0) | 2025/6/7 | [e9ada68](https://github.com/vllm-project/vllm-ascend/commit/e9ada685ece798f9fe0d4a287e3f5246a8a7207b) | +| 61 | [@hahazhky](https://github.com/hahazhky) | 2025/6/6 | [0b12c2a](https://github.com/vllm-project/vllm-ascend/commit/0b12c2acf7d9fd192beebebf662298067d9a5435) | +| 60 | [@depeng1994](https://github.com/depeng1994) | 2025/6/6 | [6b094a2](https://github.com/vllm-project/vllm-ascend/commit/6b094a2bd49a8a41eb3647568b2d9e5b337db81f) | +| 59 | [@David9857](https://github.com/David9857) | 2025/6/5 | [78431b3](https://github.com/vllm-project/vllm-ascend/commit/78431b34694dfa3c8f54ed7cc626660318557927) | +| 58 | [@momo609](https://github.com/momo609) | 2025/6/5 | [908a851](https://github.com/vllm-project/vllm-ascend/commit/908a851a776cfd9051cc062119e6ec481561c6f7) | +| 57 | [@zhangxinyuehfad](https://github.com/zhangxinyuehfad) | 2025/6/5 | [7737aaa](https://github.com/vllm-project/vllm-ascend/commit/7737aaa40f699b233a35fb61e908b687adc1e2e5) | +| 56 | [@NINGBENZHE](https://github.com/NINGBENZHE) | 2025/6/3 | [6ec64a3](https://github.com/vllm-project/vllm-ascend/commit/6ec64a3f9686df65b5a23a41aa301e669db19099) | +| 55 | [@XWFAlone](https://github.com/XWFAlone) | 2025/5/30 | [3442fbd](https://github.com/vllm-project/vllm-ascend/commit/3442fbdb235b4c6d72c2bc64a49707a7bd89958e) | +| 54 | [@YisongJiang](https://github.com/YisongJiang) | 2025/5/29 | [90afaf6](https://github.com/vllm-project/vllm-ascend/commit/90afaf6306f680307462becf3c78585737579851) | +| 53 | [@ponix-j](https://github.com/ponix-j) | 2025/5/23 | [df58fb8](https://github.com/vllm-project/vllm-ascend/commit/df58fb80eee24139fc61c495be3ce79cf81b3f73) | +| 52 | [@ttanzhiqiang](https://github.com/ttanzhiqiang) | 2025/5/23 | 
[dc6172e](https://github.com/vllm-project/vllm-ascend/commit/dc6172efd3860ce95b40a7b3e93611f875f06d40) | +| 51 | [@yangpuPKU](https://github.com/yangpuPKU) | 2025/5/23 | [46df67a](https://github.com/vllm-project/vllm-ascend/commit/46df67a5e9ab73fade08cbb2d8c0155cee7316d1) | | 50 | [@wonderful199082](https://github.com/wonderful199082) | 2025/5/20 | [5cf9ff1](https://github.com/vllm-project/vllm-ascend/commit/5cf9ff18e91b0b7031c258d71a257b8e24689763) | | 49 | [@22dimensions](https://github.com/22dimensions) | 2025/5/17 | [a8730e7](https://github.com/vllm-project/vllm-ascend/commit/a8730e7a3c4ac6c4b39a5946c943252fdea6cce5) | | 48 | [@cxcxflying](https://github.com/cxcxflying) | 2025/5/13 | [e564470](https://github.com/vllm-project/vllm-ascend/commit/e56447033889ca95df512208cab22ef832bfdf07) | From e68e81f2ce9a585aa5edca7a001e6669f514723f Mon Sep 17 00:00:00 2001 From: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com> Date: Tue, 10 Jun 2025 14:35:44 +0800 Subject: [PATCH 003/153] [CI] Make accuarcy CI and report work (#1078) ### What this PR does / why we need it? Make accuarcy CI and report work ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Manaully review Signed-off-by: hfadzxy --- .github/workflows/accuracy_report.yaml | 224 ++++++++++++------ .github/workflows/accuracy_test.yaml | 13 +- benchmarks/scripts/run_accuracy.py | 8 +- .../evaluation/accuracy_report/index.md | 6 + .../developer_guide/evaluation/index.md | 1 + 5 files changed, 170 insertions(+), 82 deletions(-) create mode 100644 docs/source/developer_guide/evaluation/accuracy_report/index.md diff --git a/.github/workflows/accuracy_report.yaml b/.github/workflows/accuracy_report.yaml index 1d03e13188..fe1dbd1034 100644 --- a/.github/workflows/accuracy_report.yaml +++ b/.github/workflows/accuracy_report.yaml @@ -19,110 +19,184 @@ name: Accuracy Report on: workflow_dispatch: inputs: - branch: - description: 'choose a dev branch to pr' + vllm-ascend-branch: + description: 'vllm-ascend branch:' required: true - vllm-ascend-version: - description: 'what vllm-ascend version to accuracy test?' 
+ type: choice + options: + - main + - v0.7.3-dev + models: + description: 'models:' required: true - type: string + type: choice + options: + - all + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen3-8B-Base + default: 'all' + jobs: - download: + download_reports: runs-on: ubuntu-latest + strategy: + matrix: + model: ${{ fromJSON( + (github.event.inputs.models == 'all' && + '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || + (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && + '["Qwen/Qwen2.5-7B-Instruct"]') || + (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && + '["Qwen/Qwen2.5-VL-7B-Instruct"]') || + (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' && + '["Qwen/Qwen3-8B-Base"]') + ) }} + + version: [0, 1] + exclude: + - model: 'Qwen/Qwen2.5-VL-7B-Instruct' + version: 1 + fail-fast: false + + name: Download ${{ matrix.model }} V${{ matrix.version }} steps: - name: Checkout repository uses: actions/checkout@v4 with: - ref: ${{ github.event.inputs.branch }} - - - name: Debug List Artifacts - run: gh api /repos/${{ github.repository }}/actions/artifacts - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ref: ${{ github.event.inputs.vllm-ascend-branch }} - - name: Query artifact run id for Qwen2.5-VL-7B-Instruct V0 latest artifact - id: get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0 + - name: Get base model name + id: get_basename run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") - RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-VL-7B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + model_base_name=$(basename "${{ matrix.model }}") + echo "model_base_name=$model_base_name" >> $GITHUB_OUTPUT + shell: bash - - name: Query artifact run id for Qwen2.5-7B-Instruct V0 latest artifact - id: get_Qwen2_5_7B_Instruct_latest_run_id_V0 + - name: Query artifact run id + id: get_run_id run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") + ARTIFACT_PATTERN="${{ github.event.inputs.vllm-ascend-branch }}-${{ steps.get_basename.outputs.model_base_name }}-V${{ matrix.version }}-report" + echo "Querying artifacts with pattern: $ARTIFACT_PATTERN" + + ARTIFACT_JSON=$(gh api --paginate /repos/${{ github.repository }}/actions/artifacts || echo "{}") + RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-7B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" + jq -s -r --arg pattern "$ARTIFACT_PATTERN" \ + '[.[].artifacts[]] | map(select(.name | test($pattern))) | sort_by(.created_at) | last | .workflow_run.id // empty') + + if [ -z "$RUN_ID" ]; then + echo "::warning::No artifact found matching pattern $ARTIFACT_PATTERN. Skipping download." 
+ echo "runid=" >> $GITHUB_OUTPUT + else + echo "Found matching artifact with run ID: $RUN_ID" + echo "runid=$RUN_ID" >> $GITHUB_OUTPUT + fi env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Query artifact run id for Qwen3-8B-Base V0 latest artifact - id: get_Qwen3_8B_Base_latest_run_id_V0 - run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") - RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen3-8B-Base-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Download Qwen/Qwen2.5-VL-7B-Instruct V0 Artifact + - name: Download Artifact + if: ${{ steps.get_run_id.outputs.runid != '' }} uses: actions/download-artifact@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-VL-7B-Instruct-V0-report - path: ./docs/source/developer_guide/evaluation/accuracy_report - github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }} + name: ${{ github.event.inputs.vllm-ascend-branch }}-${{ steps.get_basename.outputs.model_base_name }}-V${{ matrix.version }}-report + path: ./docs/source/developer_guide/evaluation/accuracy_report_bak + github-token: ${{ secrets.GITHUB_TOKEN }} + repository: ${{ github.repository }} + run-id: ${{ steps.get_run_id.outputs.runid }} + + - name: Upload reports artifact + if: ${{ steps.get_run_id.outputs.runid != '' }} + uses: actions/upload-artifact@v4 + with: + name: report-${{ steps.get_basename.outputs.model_base_name }}-v${{ matrix.version }} + path: ./docs/source/developer_guide/evaluation/accuracy_report_bak/*.md + retention-days: 90 - - name: Download Qwen/Qwen2.5-7B-Instruct Artifact - uses: actions/download-artifact@v4 + create_pr: + runs-on: ubuntu-latest + needs: download_reports + steps: + - name: Checkout repository + uses: actions/checkout@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-7B-Instruct-V0-report - path: ./docs/source/developer_guide/evaluation/accuracy_report - github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }} + ref: ${{ github.event.inputs.vllm-ascend-branch }} + + - name: Setup workspace + run: mkdir -p ./accuracy/accuracy_report - - name: Download Qwen/Qwen3-8B-Base Artifact + - name: Download only current run reports uses: actions/download-artifact@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen3-8B-Base-V0-report path: ./docs/source/developer_guide/evaluation/accuracy_report + pattern: report-* github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }} + run-id: ${{ github.run_id }} + + - name: Delete old report + run: | + find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! 
-name 'index.md' -delete + find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \; + find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete - - name: Display Files - working-directory: ./docs/source/developer_guide/evaluation/accuracy_report + - name: Generate step summary + if: ${{ always() }} run: | - cat ./Qwen2.5-VL-7B-Instruct.md - cat ./Qwen2.5-7B-Instruct.md - cat ./Qwen3-8B-Base.md - - - name: Create Pull Request for markdown update + for report in ./docs/source/developer_guide/evaluation/accuracy_report/*.md; do + filename=$(basename "$report") + # skip index.md + if [ "$filename" = "index.md" ]; then + continue + fi + + if [ -f "$report" ]; then + { + echo -e "\n\n---\n" + echo "## 📄 Report File: $(basename $report)" + cat "$report" + } >> "$GITHUB_STEP_SUMMARY" + fi + done + + - name: Update accuracy_report/index.md + run: | + REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report" + INDEX_MD="$REPORT_DIR/index.md" + + { + echo "# Accuracy Report" + echo "" + echo "::: {toctree}" + echo ":caption: Accuracy Report" + echo ":maxdepth: 1" + + for report in "$REPORT_DIR"/*.md; do + filename="$(basename "$report" .md)" + if [ "$filename" != "index" ]; then + echo "$filename" + fi + done + + echo ":::" + } > "$INDEX_MD" + + - name: Create Pull Request uses: peter-evans/create-pull-request@v7 with: token: ${{ secrets.PR_TOKEN }} - base: ${{ github.event.inputs.branch }} - branch: auto-pr/accuracy-test - commit-message: "Update accuracy report for ${{ github.event.inputs.branch }}" + base: ${{ github.event.inputs.vllm-ascend-branch }} + branch: auto-pr/accuracy-report + commit-message: "Update accuracy reports for ${{ github.event.inputs.vllm-ascend-branch }}" add-paths: ./docs/source/developer_guide/evaluation/accuracy_report/*.md - title: "[Doc]Update accuracy report for ${{ github.event.inputs.branch }}" + title: "[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-branch }}" body: | - The accuracy results running on Ascend NPU have changed, I'm updating the report. - Please review the changes. 
- + The accuracy results running on NPU Altlas A2 have changed, updating reports for: + ${{ + github.event.inputs.models == 'all' + && 'All models (Qwen2.5-7B-Instruct, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)' + || github.event.inputs.models + }} + - [Workflow run][1] - - [Qwen2.5-7B-Instruct accuracy report][2] - - [Qwen2.5-VL-7B-Instruct accuracy report][3] - - [Qwen3-8B-Base accuracy report][4] - - [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - [2]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }} - [3]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }} - [4]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }} + + [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} \ No newline at end of file diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index d0a7debaae..d1a61230ef 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -96,7 +96,7 @@ jobs: # - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct model_name: ${{ fromJSON( (github.event.inputs.models == 'all' && - '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","model_name":"Qwen/Qwen3-8B-Base"]') || + '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && '["Qwen/Qwen2.5-7B-Instruct"]') || (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && @@ -201,6 +201,7 @@ jobs: pip show torch | grep "Version:" | awk '{print "GHA_TORCH_VERSION="$2}' pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}' pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//' + echo "GHA_VLLM_ASCEND_VERSION=${{ github.event.inputs.vllm-ascend-version || github.ref }}" } >> "$GITHUB_ENV" - name: Print versions @@ -209,7 +210,7 @@ jobs: echo "Torch NPU: ${{ env.GHA_TORCH_NPU_VERSION }}" echo "Torch: ${{ env.GHA_TORCH_VERSION }}" echo "vLLM: ${{ env.GHA_VLLM_VERSION }}" - echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" + echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION }}" - name: Run Accuracy Test for V${{ matrix.vllm_use_version }} id: report @@ -238,10 +239,16 @@ jobs: run: | cat ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md >> $GITHUB_STEP_SUMMARY + - name: Sanitize version string for artifact naming + run: | + SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}" + echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV" + - name: Upload Report for V${{ matrix.vllm_use_version }} + if: ${{ github.event_name == 'workflow_dispatch' }} uses: actions/upload-artifact@v4 with: - name: "${{ env.GHA_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}-report" + name: "${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}-report" path: ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md if-no-files-found: warn retention-days: 90 diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py index f508feda73..79c58bcbc2 100644 --- a/benchmarks/scripts/run_accuracy.py +++ b/benchmarks/scripts/run_accuracy.py @@ -26,7 +26,7 @@ import lm_eval import torch -UNIMODAL_MODEL_NAME = 
["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B"] +UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B-Base"] UNIMODAL_TASK = ["ceval-valid", "gsm8k"] MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"] MULTIMODAL_TASK = ["mmmu_val"] @@ -36,17 +36,17 @@ MODEL_RUN_INFO = { "Qwen/Qwen2.5-7B-Instruct": ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" - "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen3-8B-Base": ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" - "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen2.5-VL-7B-Instruct": ("export MODEL_ARGS='pretrained={model}, max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2'\n" - "lm_eval --model vllm-vlm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --batch_size 1"), } diff --git a/docs/source/developer_guide/evaluation/accuracy_report/index.md b/docs/source/developer_guide/evaluation/accuracy_report/index.md new file mode 100644 index 0000000000..51fae4886f --- /dev/null +++ b/docs/source/developer_guide/evaluation/accuracy_report/index.md @@ -0,0 +1,6 @@ +# Accuracy Report + +:::{toctree} +:caption: Accuracy Report +:maxdepth: 1 +::: \ No newline at end of file diff --git a/docs/source/developer_guide/evaluation/index.md b/docs/source/developer_guide/evaluation/index.md index 324c2e2f2c..ea861abb19 100644 --- a/docs/source/developer_guide/evaluation/index.md +++ b/docs/source/developer_guide/evaluation/index.md @@ -6,6 +6,7 @@ using_lm_eval using_opencompass using_evalscope +accuracy_report/index ::: :::{toctree} From b75cb788dd8e2058fec79b2bbffff277ee5f12d2 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 10 Jun 2025 17:14:25 +0800 Subject: [PATCH 004/153] [Bugfix] add compilation/__init__.py to fix import error (#1152) 1. Add `__init__.py` for vllm_ascend/compilation to make sure it's a python module 2. Fix model runner bug to keep the same with vllm 3. Add release note for 0.9.0rc2 --------- Signed-off-by: wangxiyuan --- docs/source/conf.py | 4 ++-- docs/source/developer_guide/versioning_policy.md | 3 +++ docs/source/faqs.md | 10 ++++------ docs/source/user_guide/graph_mode.md | 2 +- docs/source/user_guide/release_notes.md | 8 ++++++++ vllm_ascend/compilation/__init__.py | 0 vllm_ascend/worker/model_runner_v1.py | 6 +++++- 7 files changed, 23 insertions(+), 10 deletions(-) create mode 100644 vllm_ascend/compilation/__init__.py diff --git a/docs/source/conf.py b/docs/source/conf.py index b1e33dc1b1..4b8eb0eaaa 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -68,10 +68,10 @@ # the branch of vllm-ascend, used in vllm-ascend clone and image tag # - main branch: 'main' # - vX.Y.Z branch: latest vllm-ascend release tag - 'vllm_ascend_version': 'v0.9.0rc1', + 'vllm_ascend_version': 'v0.9.0rc2', # the newest release version of vllm-ascend and matched vLLM, used in pip install. # This value should be updated when cut down release. 
- 'pip_vllm_ascend_version': "0.9.0rc1", + 'pip_vllm_ascend_version': "0.9.0rc2", 'pip_vllm_version': "0.9.0", # CANN image tag 'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10", diff --git a/docs/source/developer_guide/versioning_policy.md b/docs/source/developer_guide/versioning_policy.md index bbb08c3bee..1533b0af5a 100644 --- a/docs/source/developer_guide/versioning_policy.md +++ b/docs/source/developer_guide/versioning_policy.md @@ -22,6 +22,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin: | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo | |-------------|--------------|------------------|-------------|--------------------|--------------| +| v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | | v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | | @@ -34,6 +35,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin: | Date | Event | |------------|-------------------------------------------| +| 2025.06.10 | Release candidates, v0.9.0rc2 | | 2025.06.09 | Release candidates, v0.9.0rc1 | | 2025.05.29 | v0.7.x post release, v0.7.3.post1 | | 2025.05.08 | v0.7.x Final release, v0.7.3 | @@ -71,6 +73,7 @@ Usually, each minor version of vLLM (such as 0.7) will correspond to a vLLM Asce | Branch | Status | Note | |------------|--------------|--------------------------------------| | main | Maintained | CI commitment for vLLM main branch and vLLM 0.9.x branch | +| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.0 and 0.9.1 version | | v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version | | v0.7.1-dev | Unmaintained | Replaced by v0.7.3-dev | diff --git a/docs/source/faqs.md b/docs/source/faqs.md index 8c840be457..1d355b5e9a 100644 --- a/docs/source/faqs.md +++ b/docs/source/faqs.md @@ -3,7 +3,7 @@ ## Version Specific FAQs - [[v0.7.3.post1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1007) -- [[v0.9.0rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115) +- [[v0.9.0rc2] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115) ## General FAQs @@ -69,14 +69,14 @@ If all above steps are not working, feel free to submit a GitHub issue. ### 7. How does vllm-ascend perform? -Currently, only some models are improved. Such as `Qwen2 VL`, `Deepseek V3`. Others are not good enough. In the future, we will support graph mode and custom ops to improve the performance of vllm-ascend. And when the official release of vllm-ascend is released, you can install `mindie-turbo` with `vllm-ascend` to speed up the inference as well. +Currently, only some models are improved. Such as `Qwen2 VL`, `Deepseek V3`. Others are not good enough. From 0.9.0rc2, Qwen and Deepseek works with graph mode to play a good performance. What's more, you can install `mindie-turbo` with `vllm-ascend v0.7.3` to speed up the inference as well. ### 8. How vllm-ascend work with vllm? vllm-ascend is a plugin for vllm. Basically, the version of vllm-ascend is the same as the version of vllm. For example, if you use vllm 0.7.3, you should use vllm-ascend 0.7.3 as well. For main branch, we will make sure `vllm-ascend` and `vllm` are compatible by each commit. ### 9. Does vllm-ascend support Prefill Disaggregation feature? -Currently, only 1P1D is supported by vllm. 
For vllm-ascend, it'll be done by [this PR](https://github.com/vllm-project/vllm-ascend/pull/432). For NPND, vllm is not stable and fully supported yet. We will make it stable and supported by vllm-ascend in the future. +Currently, only 1P1D is supported on V0 Engine. For V1 Engine or NPND support, We will make it stable and supported by vllm-ascend in the future. ### 10. Does vllm-ascend support quantization method? @@ -84,9 +84,7 @@ Currently, w8a8 quantization is already supported by vllm-ascend originally on v ### 11. How to run w8a8 DeepSeek model? -Currently, w8a8 DeepSeek is working in process: [support AscendW8A8 quantization](https://github.com/vllm-project/vllm-ascend/pull/511) - -Please run DeepSeek with BF16 now, following the [Multi-Node DeepSeek inferencing tutorail](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_node.html) +Please following the [quantization inferencing tutorail](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_npu_quantization.html) and replace model to DeepSeek. ### 12. There is not output in log when loading models using vllm-ascend, How to solve it? diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md index 2bd83ffe75..126978f95b 100644 --- a/docs/source/user_guide/graph_mode.md +++ b/docs/source/user_guide/graph_mode.md @@ -54,7 +54,7 @@ outputs = model.generate("Hello, how are you?") online example: ```shell -vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enable": True}}' +vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enable": true}}' ``` You can find more detail about additional config [here](./additional_config.md) diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md index 42a944f449..8f72b937ee 100644 --- a/docs/source/user_guide/release_notes.md +++ b/docs/source/user_guide/release_notes.md @@ -1,5 +1,13 @@ # Release note +## v0.9.0rc2 - 2025.06.10 + +This release contains some quick fixes for v0.9.0rc1. Please use this release instead of v0.9.0rc1. + +### Highlights + +- Fix the import error when vllm-ascend is installed without editable way. [#1152](https://github.com/vllm-project/vllm-ascend/pull/1152) + ## v0.9.0rc1 - 2025.06.09 This is the 1st release candidate of v0.9.0 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. From this release, V1 Engine is recommended to use. The code of V0 Engine is frozen and will not be maintained any more. Please set environment `VLLM_USE_V1=1` to enable V1 Engine. diff --git a/vllm_ascend/compilation/__init__.py b/vllm_ascend/compilation/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 03be38f31f..2b343d7f7d 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -496,7 +496,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the block IDs. if not req_data.resumed_from_preemption: # Append the new blocks to the existing block IDs. - req_state.block_ids.extend(req_data.new_block_ids) + for block_ids, new_block_ids in zip( # type: ignore[call-overload] + req_state.block_ids, + req_data.new_block_ids, + strict=True): + block_ids.extend(new_block_ids) else: # The request is resumed from preemption. # Replace the existing block IDs with the new ones. 
From 95414bae7046eb4bf2323f5f21473c3b1cd158af Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 10 Jun 2025 17:18:09 +0800 Subject: [PATCH 005/153] [CI] Run e2e after pre check pass (#1132) Make sure the lint test passed before start the e2e test to save compute resource. Updated the patch doc to make sure the CI works as expect. Signed-off-by: wangxiyuan --- .github/workflows/codespell.yml | 47 ---------- .github/workflows/mypy.yaml | 68 -------------- .github/workflows/ruff.yml | 48 ---------- .github/workflows/vllm_ascend_test.yaml | 65 +++++++++++++- .github/workflows/yapf.yml | 48 ---------- vllm_ascend/patch/__init__.py | 112 ++++++++++++++---------- 6 files changed, 128 insertions(+), 260 deletions(-) delete mode 100644 .github/workflows/codespell.yml delete mode 100644 .github/workflows/mypy.yaml delete mode 100644 .github/workflows/ruff.yml delete mode 100644 .github/workflows/yapf.yml diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml deleted file mode 100644 index a239d38faf..0000000000 --- a/.github/workflows/codespell.yml +++ /dev/null @@ -1,47 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: codespell - -on: - pull_request: - branches: - - 'main' - - '*-dev' - -jobs: - codespell: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Run codespell check - run: | - CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**') - CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn') - - codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}" diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml deleted file mode 100644 index 81cd69afee..0000000000 --- a/.github/workflows/mypy.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Adapted from vllm-project/vllm/blob/main/.github -# - -name: mypy - -on: - pull_request: - branches: - - 'main' - - '*-dev' - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. - paths: - - '**/*.py' - - '.github/workflows/mypy.yaml' - - 'tools/mypy.sh' - - 'mypy.ini' - -jobs: - mypy: - runs-on: ubuntu-latest - strategy: - matrix: - # TODO(yikun): Add 3.12 back when torch-npu support 3.12 - python-version: ["3.9", "3.10", "3.11"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu - - - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v4 - with: - repository: vllm-project/vllm - path: vllm-empty - - - name: Install vllm-project/vllm from source - working-directory: vllm-empty - run: | - pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu - VLLM_TARGET_DEVICE=empty pip install . - - - name: Mypy - run: | - echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml deleted file mode 100644 index a19e6a89dc..0000000000 --- a/.github/workflows/ruff.yml +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: ruff - -on: - pull_request: - branches: - - 'main' - - '*-dev' - -jobs: - ruff: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 13343289ad..32524eecac 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. 
# -name: 'e2e test' +name: 'test' on: schedule: @@ -31,6 +31,9 @@ on: - '!docs/**' - 'pytest.ini' - '!benchmarks/**' + - 'tools/mypy.sh' + - 'mypy.ini' + # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. # It's used to activate ascend-toolkit environment variables. @@ -39,7 +42,65 @@ defaults: shell: bash -el {0} jobs: - test: + lint: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Run codespell check + run: | + CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**') + CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn') + + codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}" + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only + - name: Running yapf + run: | + python -m pip install --upgrade pip + pip install toml + pip install yapf==0.32.0 + yapf --diff --recursive . + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: vllm-empty + run: | + pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu + VLLM_TARGET_DEVICE=empty pip install . + + - name: Mypy Check + run: | + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tools/mypy.sh 1 ${{ matrix.python-version }} + + e2e: + needs: [lint] + if: ${{ needs.lint.result == 'success' }} strategy: max-parallel: 2 matrix: diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml deleted file mode 100644 index 64497d1563..0000000000 --- a/.github/workflows/yapf.yml +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Adapted from vllm-project/vllm/blob/main/.github -# - -name: yapf - -on: - pull_request: - branches: - - 'main' - - '*-dev' - paths: - - "**/*.py" - - .github/workflows/yapf.yml - -jobs: - yapf: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install toml - pip install yapf==0.32.0 - - name: Running yapf - run: | - yapf --diff --recursive . diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index ccf9bd9e0c..4be92c23a4 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -35,6 +35,17 @@ # -------------------------------- # * Platform Patch: # ================= +# ** File: platform/patch_0_9_0/patch_distributed.py** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.distributed.utils.stateless_init_torch_distributed_process_group()` +# Why: +# vllm distributed use gloo backend by default to initialize stateless process group, but we want to use hccl here +# How: +# Add hccl backend to the `stateless_init_torch_distributed_process_group` +# Related PR (if no, explain why): +# https://github.com/vllm-project/vllm/pull/18763 +# Future Plan: +# Remove this patch once vllm is upgraded to 0.9.1 # ** File: platform/patch_common/patch_distributed.py** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.distributed.parallel_state.destroy_model_parallel()` @@ -44,56 +55,40 @@ # platform owned `CoordinatorGroup` to make sure all the CoordinateGroup can be properly destroyed # How: # Call `vllm_ascend.distributed.parallel_state method `destroy_platform_model_parallel` to destroy all the `CoordinateGroup` -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# Related PR (if no, explain why): # Future Plan: # Remove those patch when vllm merged them -# 2. `vllm.distributed.stateless_init_torch_distributed_process_group()` +# 2. `vllm.v1.engine.core.DPEngineCoreProc._init_data_parallel` # Why: -# The stateless process group can not be initialized except from gloo and nccl backend, vllm-ascend -# needs to initialize its own stateless process group for communication, so we add the platform related -# call to the `stateless_init_torch_distributed_process_group`, to enable other platform which may support -# stateless process group initialize method +# There is some bug for ASCEND_RT_VISIBLE_DEVICES usage. # How: -# rewrite stateless_init_torch_distributed_process_group to judge if there is a stateless process group initialize -# method and call platform method `platform_register_backend` to initialize them -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# The ASCEND_RT_VISIBLE_DEVICES related code is dropped. +# Related PR (if no, explain why): +# No, this is a bug for vllm ascend # Future Plan: -# Remove those patch when vllm merged them -# 3. `ParallelConfig.get_next_dp_init_port` +# Remove this patch once ASCEND_RT_VISIBLE_DEVICES bug is fixed. +# 3. `vllm.config.ParallelConfig.get_next_dp_init_port` # Why: -# We want to get dp port from env variable, so the multi-node inference can be properly initialized and run. +# vllm doesn't support get port from environment. 
# How: -# Get the dp port from env variable enable multi-mode dp inference -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# Add the logic to get port from environment. +# Related PR (if no, explain why): +# Need a PR to vllm to support get port from environment. # Future Plan: -# Its a workaround in vllm-ascend to enable multi-node dp inference, maybe removed if vllm have better plan -# on multi-node dp inference implementation -# 4. `ParallelConfig.stateless_init_dp_group` +# Remove those patch when vllm merged them +# 4. `vllm.config.ParallelConfig.ParallelConfig.stateless_init_dp_group` # Why: # vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to # get better performance # How: -# adopt nccl backend to init process group -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# adopt nccl backend to init process group.(Now we still use gloo, it's just a placeholder, we'll use nccl in the future) +# Related PR (if no, explain why): +# Need a PR to vllm to support more backend. # Future Plan: -# Remove those patch when vllm merged them -# +# Remove those patch when vllm support more backend. # # * Worker Patch: # =============== -# ** File: worker/patch_common/patch_metrics.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. `vllm.spec_decode.metrics.AsyncMetricsCollector.maybe_collect_rejsample_metrics` -# Why: -# There are cuda hard code (current_platform.is_cuda_alike()) in -# `AsyncMetricsCollector.maybe_collect_rejsample_metrics` -# How: -# Change to use `current_platform.Event` to determine whether to return None -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... -# https://github.com/vllm-project/vllm/pull/14411 -# Future Plan: -# Revert it when the related pr is merged in vllm. -# # ** File: worker/patch_common/patch_minicpm.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.model_executor.models.minicpm.MiniCPMAttention.forward` @@ -103,7 +98,7 @@ # However float32 is not supported in cann rope op, thus we keep this patch # How: # Removed the dtype convert operations in forward -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # NO, only for npu due to rope op. # Future Plan: # Keep this patch in vllm-ascend. @@ -119,7 +114,7 @@ # - support attention metadata register to the set supported spec decode # - offer a api in platform to determine whether spec decode is supported, # and deprecate is_cuda_alike in it. -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: @@ -131,14 +126,14 @@ # vLLM `Remove Sampler from Model Code` so vllm-ascend needs adapt to this change. # How: # Use vLLM 0.8.4 method to patch it. -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: # Remove it when we identify the reasons clearly. 
# # ** File: worker/patch_common/patch_spec_decode_worker.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker` # Why: # We need to use the patched `TP1DraftModelRunner` in `SpecDecodeWorker.create_worker`. @@ -146,14 +141,14 @@ # `FlashAttentionMetadata` # How: # ditto -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: # Revert it when the related pr is merged in vllm and vllm-ascend. # # ** File: worker/patch_common/patch_eagle.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.spec_decode.eagle.prepare_inputs` # Why: # We need to use the patched `prepare_input_kernel` in `eagle.prepare_inputs`. @@ -161,12 +156,12 @@ # kernel, ascend is now not support triton kernel. # How: # Re-implementation the `prepare_input_kernel` triton kernel by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - Ascend doesn't support triton # Future Plan: # Revert it when the ascend support triton kernel. # -# ** File: v1/sample/sampler.py ** +# ** File: worker/patch_common/patch_sampler.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.sample.sampler.Sampler.apply_top_k_top_p` # Why: @@ -175,21 +170,44 @@ # to improve performance. # How: # Re-implementation the `apply_top_k_top_p` function by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm-ascend/pull/970 # Future Plan: # Revert it when the ascend scatter performance improves. # -# ** File: v1/sample/sampler.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~s -# 1. `vllm.v1.sample.sampler.Sampler.apply_min_p` +# 2. `vllm.v1.sample.sampler.Sampler.apply_min_p` # Why: # We need to use the patched `apply_min_p` in `sample`. # The mainly reason to overwrite `apply_min_p` is # to improve performance. # How: # Re-implementation the `apply_min_p` function by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm-ascend/pull/970 # Future Plan: # Revert it when the ascend indexput performance improves. +# +# ** File: worker/patch_common/patch_distributed.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.distributed.parallel_state.GroupCoordinator` +# Why: +# vllm doesn't support all_to_all for GroupCoordinator. +# How: +# Add all_to_all implementation for GroupCoordinator. +# Related PR (if no, explain why): +# Need a PR to vllm to support all_to_all for GroupCoordinator. +# Future Plan: +# Remove this patch when vllm merged them. +# +# ** File: worker/patch_common/patch_utils.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.utils.direct_register_custom_op` +# Why: +# pytorch 2.7.o is not compatible with pytorch 2.5.1. While vllm is based on pytorch 2.7.0, but vllm ascend +# is based on pytorch 2.5.1, so we need to use this patch to make vllm compatible with pytorch 2.5.1. 
+#    How:
+#    patch __annotations__ check to make it compatible with pytorch 2.5.1.
+#    Related PR (if no, explain why):
+#    This is the problem in vllm-ascend
+#    Future Plan:
+#    Remove this patch once pytorch 2.7.0 is supported for vllm ascend.

From 2a9432199214780976cc4493fb94f0e8a21b28ba Mon Sep 17 00:00:00 2001
From: NPUDev
Date: Tue, 10 Jun 2025 20:36:06 +0800
Subject: [PATCH 006/153] add eplb policy

---
 vllm_ascend/eplb/core/policy/__init__.py      |  0
 vllm_ascend/eplb/core/policy/eplb_policy.py   | 42 +++++++++++++++++++
 .../eplb/core/policy/mock_load_balance.py     | 29 +++++++++++++
 .../eplb/core/policy/policy_factory.py        | 15 +++++++
 4 files changed, 86 insertions(+)
 create mode 100644 vllm_ascend/eplb/core/policy/__init__.py
 create mode 100644 vllm_ascend/eplb/core/policy/eplb_policy.py
 create mode 100644 vllm_ascend/eplb/core/policy/mock_load_balance.py
 create mode 100644 vllm_ascend/eplb/core/policy/policy_factory.py

diff --git a/vllm_ascend/eplb/core/policy/__init__.py b/vllm_ascend/eplb/core/policy/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/vllm_ascend/eplb/core/policy/eplb_policy.py b/vllm_ascend/eplb/core/policy/eplb_policy.py
new file mode 100644
index 0000000000..1de60c348d
--- /dev/null
+++ b/vllm_ascend/eplb/core/policy/eplb_policy.py
@@ -0,0 +1,42 @@
+# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+from abc import abstractmethod
+
+
+class DynamicConfig:
+    placement_policy = None
+
+    max_transferred_expert_per_layer = 100
+    # Maximum number of experts that may be moved per layer on a single host
+
+    ep_worldsize = 64  # Number of dies across which all experts in the cluster are distributed
+    num_die_per_host = 8  # Number of dies on each host
+
+
+class EplbPolicy:
+    def __init__(self, config: DynamicConfig):
+        self.config = config
+
+    @abstractmethod
+    def rebalance_experts(self, current_expert_table, expert_workload):
+        """
+        Take the expert workload and return the expert replication and placement under the given constraints.
+        INPUT:
+        current_expert_table: [layerId, rankId, expert_num_i]
+        expert_workload = expert_table[layer0][rankId][expert_num_i]
+
+        RETURNED: (res, expert_table)
+        res:
+        1 -- table_changed
+        0 -- not_changed
+
+        expert_table: [layerId, rankId, expert_num_i]
+        expert_num_i --- [0, MaxExpertPerRank]
+        expertID = expert_table[layer0][rankId][expert_num_i]
+        array_values:
+        [0, 1, 2, 3, 248]
+        [4, 5, 6, 7, 254]
+        [8, 9, 10, 11, 71]
+        ...
+        [252, 253, 254, 255, 0]
+        """
+        pass
diff --git a/vllm_ascend/eplb/core/policy/mock_load_balance.py b/vllm_ascend/eplb/core/policy/mock_load_balance.py
new file mode 100644
index 0000000000..0ba2ca21ba
--- /dev/null
+++ b/vllm_ascend/eplb/core/policy/mock_load_balance.py
@@ -0,0 +1,29 @@
+# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
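+#
+# MockLoadBalance is a minimal placeholder policy: on every rebalance it picks
+# two random ranks per MoE layer and swaps the expert held in their last
+# (redundant) slot, so the expert map changes without any real load-aware
+# planning.
+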
+import copy
+import random
+
+import torch
+
+from .eplb_policy import EplbPolicy, DynamicConfig
+
+random.seed(42)
+
+class MockLoadBalance(EplbPolicy):
+    def __init__(self, config: DynamicConfig):
+        super().__init__(config)
+
+    def rebalance_experts(self, current_expert_table, expert_workload):
+        new_table = copy.deepcopy(current_expert_table)
+        num_layers = len(current_expert_table)
+        num_card = len(current_expert_table[0])
+
+        for i in range(num_layers):
+            # Randomly pick two ranks (cards)
+            indices = random.sample(range(num_card), 2)
+
+            # Swap the redundant expert held in the last slot of the two ranks
+            expert_id_to_exchange = new_table[i][indices[0]][-1].clone()
+            new_table[i][indices[0]][-1] = new_table[i][indices[1]][-1]
+            new_table[i][indices[1]][-1] = expert_id_to_exchange
+
+        return 1, [-i for i in range(num_layers)], new_table
\ No newline at end of file
diff --git a/vllm_ascend/eplb/core/policy/policy_factory.py b/vllm_ascend/eplb/core/policy/policy_factory.py
new file mode 100644
index 0000000000..d06d1ce634
--- /dev/null
+++ b/vllm_ascend/eplb/core/policy/policy_factory.py
@@ -0,0 +1,15 @@
+# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+from .eplb_policy import EplbPolicy, DynamicConfig
+from .mock_load_balance import MockLoadBalance
+from .mock_dynamic_ep import DynamicEP
+
+
+class PolicyFactory:
+    @staticmethod
+    def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy:
+        policy = {
+            0: MockLoadBalance,  # MockLoadBalance
+            1: DynamicEP,        # DynamicEP
+        }
+        return policy.get(policy_type, MockLoadBalance)(config)

From e91956f49ad43b050069d8be946017d1ecaf62ce Mon Sep 17 00:00:00 2001
From: NPUDev
Date: Tue, 10 Jun 2025 20:36:15 +0800
Subject: [PATCH 007/153] add eplb updator

---
 vllm_ascend/eplb/eplb_updator.py | 148 +++++++++++++++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 vllm_ascend/eplb/eplb_updator.py

diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py
new file mode 100644
index 0000000000..99cd42255b
--- /dev/null
+++ b/vllm_ascend/eplb/eplb_updator.py
@@ -0,0 +1,148 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
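+#
+# A typical driving sequence from the model runner is assumed to look like
+# this (the concrete adaptor type is introduced in a later patch of this
+# series, so its constructor is not shown here):
+#     updator = EplbUpdator()
+#     updator.set_adaptor(adaptor)   # adaptor exposing the MoE expert maps/weights
+#     updator.forward_before()       # apply any pending expert-weight transfers
+#     ...                            # run the model forward pass
+#     updator.forward_end()          # record MoE load and wake the EPLB planner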
+# +import torch +import torch.distributed as dist +from multiprocessing import Queue, Manager + +from vllm.logger import logger +from vllm_ascend.eplb.core.worker.eplb_worker import EplbProcess +from vllm_ascend.eplb.core.loader.device_transfer_loader import D2DExpertWeightLoader + +class EplbUpdator: + + def __init__(self): + self.init_eplb() + + def set_adaptor(self, adaptor): + self.adaptor = adaptor + self.eplb_loader = D2DExpertWeightLoader(eplb_adaptor=self.adaptor) + + def init_eplb(self): + + self.num_iterations: torch.int64 = 10 + + self.num_moe_layers = 2 + self.weight_update_counter = 0 + self.expert_map_initialized = False + self.update_in_flight = False + + self.reqs = [] + + self.cur_iterations: torch.int64 = 0 + + self.planner_block_queue = Queue() + self.block_update_queue = Queue(maxsize=1) + + self.manager = Manager() + self.shared_dict = self.manager.dict({ + # 当前rank_id的专家表[num_layers,num_experts] + "expert_map": None, + # 热度负载信息 [num_layers, world_size, num_experts] + "moe_load": None, + # 所有的专家表[num_layers, world_size, num_experts] + "expert_maps": None + }) + + self.eplb = EplbProcess( + shared_dict = self.shared_dict, + planner_q = self.planner_block_queue, + block_update_q = self.block_update_queue, + policy_type = 0, + enable_d2d = True + ) + + self.eplb_process = self.eplb._launch_process() + + logger.info(f"[ModelRunner] Launched EPLB process (pid={self.eplb_process.pid})") + + + def get_update_iteration(self): + self.cur_iterations = self.cur_iterations + 1 + return self.cur_iterations % self.num_iterations == 0 + + def get_init_expert_map(self): + try: + if not self.expert_map_initialized: + self.shared_dict["expert_maps"] = self.adaptor.get_init_expert_map(self.num_moe_layers) + self.expert_map_initialized = True + except Exception as e: + logger.warning(f"[ModelRunner] Failed to wake EPLB process: {e}", exc_info=True) + + def wakeup_eplb_worker(self): + self.planner_block_queue.put(1) + + def forward_before(self): + if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: + (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.block_update_queue.get() + rank_id = torch.distributed.get_rank() + expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] + expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] + # TODO: layer_id + 3 should be replaced by configuration + self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info_this_rank, + expert_recv_info_this_rank, updated_expert_map[rank_id], layer_id + 3) + self.weight_update_counter += 1 + if self.weight_update_counter == self.num_moe_layers: + self.update_in_flight = False + + # set asynchronous stream for d2d expert weight update + self.reqs = [] + self.eplb_loader.asyn_expert_weight_transfer(self.reqs) + + def forward_end(self): + + self.get_init_expert_map() + + if not self.update_in_flight and self.get_update_iteration(): + moe_load = self.compute_and_set_moe_load() + self.wakeup_eplb_worker() + self.update_in_flight = True + + self.eplb_loader.update_expert_map_and_weight(self.reqs) + + def compute_and_set_moe_load(self): + local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers) + + self._gather_buffer = None + if dist.is_initialized(): + self.world_size = dist.get_world_size() + if dist.is_initialized(): + device = local_load.device + if self._gather_buffer is None: + shape = (self.world_size, *local_load.shape) + self._gather_buffer = torch.empty(shape, + 
dtype=local_load.dtype, + device=device) + + dist.all_gather_into_tensor(self._gather_buffer, local_load) + + moe_load = self._gather_buffer.permute(1, 0, 2).contiguous() + self.shared_dict["moe_load"] = moe_load.cpu() + logger.debug(f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}") + else: + moe_load = local_load.unsqueeze(1) + self.shared_dict["moe_load"] = moe_load.cpu() + logger.debug(f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}") + return moe_load + + def shutdown(self): + """ + Clean up the EPLB process. + """ + if self.eplb_process.is_alive(): + self.eplb_process.terminate() + self.eplb_process.join() + logger.info("[ModelRunner] EPLB process terminated") From 66b3d2e62c0c2c9f8f57a17069c90ea2ee33831d Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Tue, 10 Jun 2025 20:37:15 +0800 Subject: [PATCH 008/153] implementation of VllmEplbAdaptor and D2DExpertWeightLoader --- vllm_ascend/eplb/adaptor/abstract_adaptor.py | 39 +++++ vllm_ascend/eplb/adaptor/vllm_adaptor.py | 90 +++++++++++ .../eplb/core/loader/abstract_loader.py | 24 +++ .../core/loader/device_transfer_loader.py | 141 ++++++++++++++++++ 4 files changed, 294 insertions(+) create mode 100644 vllm_ascend/eplb/adaptor/abstract_adaptor.py create mode 100644 vllm_ascend/eplb/adaptor/vllm_adaptor.py create mode 100644 vllm_ascend/eplb/core/loader/abstract_loader.py create mode 100644 vllm_ascend/eplb/core/loader/device_transfer_loader.py diff --git a/vllm_ascend/eplb/adaptor/abstract_adaptor.py b/vllm_ascend/eplb/adaptor/abstract_adaptor.py new file mode 100644 index 0000000000..8513b69ea0 --- /dev/null +++ b/vllm_ascend/eplb/adaptor/abstract_adaptor.py @@ -0,0 +1,39 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +from abc import ABC, abstractmethod + +class EplbAdaptor(): + + def __init__(self, **args): + pass + + @abstractmethod + def get_rank_expert_workload(self, num_moe_layers): + raise NotImplementedError + + @abstractmethod + def get_init_expert_map(self): + raise NotImplementedError + + @abstractmethod + def do_update_expert_map(self): + raise NotImplementedError + + @abstractmethod + def do_update_expert_weight(self): + raise NotImplementedError \ No newline at end of file diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py new file mode 100644 index 0000000000..f4241a01fa --- /dev/null +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -0,0 +1,90 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
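Editor's note: the load collection in EplbUpdator.compute_and_set_moe_load above can be restated in isolation as the sketch below. This is a hedged illustration rather than code from the patch: the helper name and the single-process fallback are assumptions, and it presumes torch.distributed has already been initialized with a suitable backend when run across ranks.

```python
import torch
import torch.distributed as dist


def gather_moe_load(local_load: torch.Tensor) -> torch.Tensor:
    """Turn a per-rank [num_layers, num_experts] load tensor into a global
    [num_layers, world_size, num_experts] tensor (assumed helper)."""
    if not dist.is_initialized():
        # Single-process fallback: fake a world_size dimension of 1.
        return local_load.unsqueeze(1)
    world_size = dist.get_world_size()
    buf = torch.empty((world_size, *local_load.shape),
                      dtype=local_load.dtype,
                      device=local_load.device)
    dist.all_gather_into_tensor(buf, local_load)
    # [world_size, num_layers, num_experts] -> [num_layers, world_size, num_experts]
    return buf.permute(1, 0, 2).contiguous()
```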
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import torch +import torch.distributed as dist + +from vllm_ascend.eplb.adaptor.abstract_adaptor import EplbAdaptor +from vllm.logger import logger + +class VllmEplbAdaptor(EplbAdaptor): + + def __init__(self, model, **args): + super().__init__(**args) + self.model = model + self.param_dict = dict(self.model.named_parameters()) + + # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 is supported here + self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", + "w2_weight_scale", "w2_weight_offset"] + + self.buffer_tensor_dict = dict() + num_buffer_tensor = 100 # TO DO: provide number of buffer tensor by vllm configuration + params_dtype = torch.int8 # TO DO: provide number of buffer tensor by vllm configuration + self.init_buffer_tensor_dict(num_buffer_tensor, params_dtype) + + self.expert_map_per_layer = dict() + num_moe_layers = 2 # TO DO: provide number of num_moe_layers by vllm configuration + for layer_idx in range(num_moe_layers): + self.expert_map_per_layer[3 + layer_idx] = self.model.get_expert_map(3 + layer_idx) + + def init_buffer_tensor_dict(self, num_buffer_tensor, params_dtype): + for name in self.expert_weight_names: + complete_name = "model.layers.3.mlp.experts." + name + expert_tensor = self.param_dict[complete_name].data[0:num_buffer_tensor] + self.buffer_tensor_dict[name] = torch.empty_like(expert_tensor) + + def get_buffer_tensor(self, buffer_tensor_id): + for name in self.expert_weight_names: + yield self.buffer_tensor_dict[name][buffer_tensor_id] + + def get_expert_tensor(self, layer_id, global_expert_id_to_send): + for name in self.expert_weight_names: + complete_name = "model.layers." + str(layer_id) + ".mlp.experts." + name + local_expert_id = self.expert_map_per_layer[layer_id][global_expert_id_to_send].item() + yield self.param_dict[complete_name].data[local_expert_id] + + def get_rank_expert_workload(self, num_moe_layers): + return self.model.get_all_moe_loads(num_moe_layers) + + def get_init_expert_map(self, num_moe_layers): + expert_map = self.model.get_all_expert_map(num_moe_layers) + if dist.is_initialized(): + world_size = dist.get_world_size() + + rank = dist.get_rank() + + tensor_list = [ + torch.zeros_like(expert_map) for _ in range(world_size) + ] + + dist.all_gather(tensor_list, expert_map) + gathered = torch.stack(tensor_list, dim=0) + all_maps = gathered.permute(1, 0, 2).contiguous() + + all_expert_maps = all_maps.to(torch.device("cpu")) + return all_expert_maps + + def do_update_expert_map(self, layer_id, updated_expert_map): + self.expert_map_per_layer[layer_id].copy_(updated_expert_map) + + def do_update_expert_weight(self, layer_id, expert_id_before_replace, buffer_tensor_id): + for name in self.expert_weight_names: + complete_name = "model.layers." + str(layer_id) + ".mlp.experts." 
+ name + local_expert_id = self.expert_map_per_layer[layer_id][expert_id_before_replace].item() + expert_tensor = self.param_dict[complete_name].data[local_expert_id] + expert_tensor.copy_(self.buffer_tensor_dict[name][buffer_tensor_id]) \ No newline at end of file diff --git a/vllm_ascend/eplb/core/loader/abstract_loader.py b/vllm_ascend/eplb/core/loader/abstract_loader.py new file mode 100644 index 0000000000..b1bef11c5d --- /dev/null +++ b/vllm_ascend/eplb/core/loader/abstract_loader.py @@ -0,0 +1,24 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +from abc import ABC, abstractmethod + +class ExpertWeightLoader: + + @abstractmethod + def load_impl(self, old_expert_table, new_expert_table): + raise NotImplementedError \ No newline at end of file diff --git a/vllm_ascend/eplb/core/loader/device_transfer_loader.py b/vllm_ascend/eplb/core/loader/device_transfer_loader.py new file mode 100644 index 0000000000..ea48798670 --- /dev/null +++ b/vllm_ascend/eplb/core/loader/device_transfer_loader.py @@ -0,0 +1,141 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
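Editor's note: to clarify how VllmEplbAdaptor above moves expert weights through pre-allocated buffers, here is a minimal sketch built around a made-up stand-in module. The real adaptor resolves names such as "model.layers.<id>.mlp.experts.w13_weight" from the vLLM model's named_parameters(); the module, sizes, and slot indices below are arbitrary assumptions for illustration only.

```python
import torch


class _FakeExperts(torch.nn.Module):
    """Hypothetical stand-in for a MoE expert container (not from the patch)."""

    def __init__(self, num_experts: int = 8, hidden: int = 4):
        super().__init__()
        self.w13_weight = torch.nn.Parameter(
            torch.zeros(num_experts, hidden, hidden), requires_grad=False)


model = _FakeExperts()
param_dict = dict(model.named_parameters())

# Pre-allocate receive buffers shaped like the first N expert slices,
# analogous to init_buffer_tensor_dict (N == num_buffer_tensor).
num_buffer_tensor = 2
buffer = torch.empty_like(param_dict["w13_weight"].data[0:num_buffer_tensor])

# After a transfer completes, copy buffer slot 0 into local expert slot 3,
# analogous to do_update_expert_weight.
param_dict["w13_weight"].data[3].copy_(buffer[0])
```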
+# +import torch +import torch.distributed as dist +from enum import Enum + +from vllm.logger import logger +from vllm_ascend.eplb.core.loader.abstract_loader import ExpertWeightLoader + +class ExpertWeightUpdateState(Enum): + WAITING = 0 # waiting for updated expert_map by EplbWorker + READY = 1 # ready for d2d expert weights updating + TRANSFERING = 2 # d2d finished and waiting for updating expert_map into model + +class D2DExpertWeightLoader(ExpertWeightLoader): + + def __init__(self, eplb_adaptor): + self.comm_op_list = None + self.eplb_adaptor = eplb_adaptor + + self.updated_expert_map = None + self.layer_id = -1 # layer id to be updated + self.state = ExpertWeightUpdateState.WAITING + self.recv_expert_list = [] + self.mock_flag = True + + def generate_expert_d2d_transfer_task(self, expert_send_info, expert_recv_info, + updated_expert_map, layer_id): + # When current send/recv and weight.expert_map update tasks are not finished, cannot accept new d2d task + if self.state != ExpertWeightUpdateState.WAITING: + logger.error("current d2d weight update tasks are on-going, cannot accept new weight update task") + return + + # If neither send nor receive task is needed for this layer on this rank, return + if not (expert_send_info or expert_recv_info): + return + + self.updated_expert_map = updated_expert_map + + self.layer_id = layer_id + self.comm_op_list = [] + for send_info in expert_send_info: + dst_rank, global_expert_id_to_send = send_info + for src_tensor in self.eplb_adaptor.get_expert_tensor(layer_id, global_expert_id_to_send): + self.comm_op_list.append(dist.P2POp(dist.isend, src_tensor, dst_rank)) + + buffer_tensor_id = 0 + for recv_info in expert_recv_info: + recv_rank, global_expert_id_to_recv = recv_info + for buffer_tensor in self.eplb_adaptor.get_buffer_tensor(buffer_tensor_id): + self.comm_op_list.append(dist.P2POp(dist.irecv, buffer_tensor, recv_rank)) + self.recv_expert_list.append((global_expert_id_to_recv, buffer_tensor_id)) + buffer_tensor_id += 1 + + self.state = ExpertWeightUpdateState.READY + + def asyn_expert_weight_transfer(self, reqs): + # Only when send/recv tasks are parsed into self.comm_op_list, d2d send/recv tasks can be luanched + if self.state != ExpertWeightUpdateState.READY: + return + + # set asynchronous stream for d2d expert weight transfer + if self.comm_op_list: + reqs = dist.batch_isend_irecv(self.comm_op_list) + + self.state = ExpertWeightUpdateState.TRANSFERING + + def update_expert_map_and_weight(self, reqs): + # Only after send/recv tasks have been luanched, expert_map and weight can be updated + if self.state != ExpertWeightUpdateState.TRANSFERING: + return + + # Waiting for send/recv tasks finish + for req in reqs: + req.wait() + + if self.comm_op_list is not None: + self.comm_op_list = None + + # update expert_map + self.eplb_adaptor.do_update_expert_map(self.layer_id, self.updated_expert_map) + + # update expert weight + buffer_tensor_id = 0 + for recv_expert_info in self.recv_expert_list: + global_expert_id_to_recv, buffer_tensor_id = recv_expert_info + self.eplb_adaptor.do_update_expert_weight(self.layer_id, global_expert_id_to_recv, buffer_tensor_id) + + self.recv_expert_list = [] + self.updated_expert_map = None + self.layer_id = -1 + self.state = ExpertWeightUpdateState.WAITING + + def generate_mock_update_info(self, rank_id): + if rank_id == 0: + expert_send_info = [(1, 0)] + expert_recv_info = [(1, 64)] + updated_expert_map_list = [-1] + [i for i in range(1, 64)] + [0] + [j for j in [-1] * 191] + updated_expert_map = 
torch.tensor(updated_expert_map_list) + layer_id = 3 + + if rank_id == 1: + expert_send_info = [(0, 64)] + expert_recv_info = [(0, 0)] + updated_expert_map_list = [0] + [k for k in [-1] * 63] + [i for i in range(1, 64)] + [j for j in [-1] * 129] + updated_expert_map = torch.tensor(updated_expert_map_list) + layer_id = 3 + + if rank_id == 2: + expert_send_info = [(3, 128)] + expert_recv_info = [(3, 192)] + updated_expert_map_list = [k for k in [-1] * 129] + [i for i in range(1, 64)] + [0] + [j for j in [-1] * 63] + updated_expert_map = torch.tensor(updated_expert_map_list) + layer_id = 3 + + if rank_id == 3: + expert_send_info = [(2, 192)] + expert_recv_info = [(2, 128)] + updated_expert_map_list = [k for k in [-1] * 128] + [0] + [k for k in [-1] * 64] + [i for i in range(1, 64)] + updated_expert_map = torch.tensor(updated_expert_map_list) + layer_id = 3 + + self.mock_flag = False + return (expert_send_info, expert_recv_info, updated_expert_map, layer_id) + + def load_impl(self, old_expert_table, new_expert_table): + raise NotImplementedError \ No newline at end of file From 86fe2c027f47f4763c173233a8aab9ef20b4751e Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Tue, 10 Jun 2025 21:12:11 +0800 Subject: [PATCH 009/153] determine num_dense_layers and num_moe_layers by refering to model configuration --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index f4241a01fa..531ff205eb 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -27,6 +27,8 @@ def __init__(self, model, **args): super().__init__(**args) self.model = model self.param_dict = dict(self.model.named_parameters()) + self.num_dense_layers = self.model.config.first_k_dense_replace + self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 is supported here self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", @@ -34,17 +36,16 @@ def __init__(self, model, **args): self.buffer_tensor_dict = dict() num_buffer_tensor = 100 # TO DO: provide number of buffer tensor by vllm configuration - params_dtype = torch.int8 # TO DO: provide number of buffer tensor by vllm configuration - self.init_buffer_tensor_dict(num_buffer_tensor, params_dtype) + self.init_buffer_tensor_dict(num_buffer_tensor) self.expert_map_per_layer = dict() - num_moe_layers = 2 # TO DO: provide number of num_moe_layers by vllm configuration - for layer_idx in range(num_moe_layers): - self.expert_map_per_layer[3 + layer_idx] = self.model.get_expert_map(3 + layer_idx) + for layer_idx in range(self.num_moe_layers): + self.expert_map_per_layer[self.num_dense_layers + layer_idx] =\ + self.model.get_expert_map(self.num_dense_layers + layer_idx) - def init_buffer_tensor_dict(self, num_buffer_tensor, params_dtype): + def init_buffer_tensor_dict(self, num_buffer_tensor): for name in self.expert_weight_names: - complete_name = "model.layers.3.mlp.experts." + name + complete_name = "model.layers." + str(self.num_dense_layers) + ".mlp.experts." 
+ name expert_tensor = self.param_dict[complete_name].data[0:num_buffer_tensor] self.buffer_tensor_dict[name] = torch.empty_like(expert_tensor) From e68e522399a36c13cc8350730cbf6eebde222020 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 10 Jun 2025 21:25:42 +0800 Subject: [PATCH 010/153] EPLB add eplb_worker --- vllm_ascend/eplb/core/worker/eplb_worker.py | 261 ++++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 vllm_ascend/eplb/core/worker/eplb_worker.py diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py new file mode 100644 index 0000000000..581a509c96 --- /dev/null +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -0,0 +1,261 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import time +import numpy as np +import torch +import torch_npu +import logging +import torch.distributed as dist +from multiprocessing import Process, Queue, Manager +from abc import ABC, abstractmethod +from vllm.logger import logger + +from vllm_ascend.eplb.core.policy.policy_factory import PolicyFactory, DynamicConfig + + +class EplbWorker: + + def __init__(self, shared_dict, policy_type, enable_d2d: bool = True): + self.policy_type = policy_type + self.policy = PolicyFactory.generate_policy(policy_type, DynamicConfig()) + self.shared_dict = shared_dict + self.old_expert_maps = None + self.enable_d2d = enable_d2d + + def do_update(self): + # put data in to queue + # in process self.policy.generate_policy() + # get epxert table && tensor + + # async stream + # D2D + # H2D + + # Get initial expert_map + if self.old_expert_maps is None: + self.old_expert_maps = self.get_init_expert_maps() + self.num_local_experts = self.old_expert_maps.max() + 1 + + # Get MOE load information + load_info = self.fetch_and_sum_load_info() + if load_info is None: + return + + #根据负载信息,获取更新后的专家表 + load_info, old_placemet = self.global2local(load_info, self.old_expert_maps, self.num_local_experts) + changed, priority, new_placement = self.calculate_rebalance_experts(load_info, old_placemet) + + new_expert_maps = self.local2global(new_placement) + + logger.debug(f"[EPLB Process new_map differs, performing D2D") + + update_info = self.compose_expert_update_info(new_expert_maps, self.old_expert_maps) + self.old_expert_maps = new_expert_maps + logger.info("EPLB Process complete") + + return update_info + + # TODO: Here only expert weight exchange is considered, need to be extended to cover other weight update cases + def compose_expert_update_info(self, updated_expert_maps, current_expert_maps): + num_layers = current_expert_maps.shape[0] + num_ranks = current_expert_maps.shape[1] + num_experts = current_expert_maps.shape[2] + + for layer_id in range(num_layers): + updated_expert_maps_this_layer = updated_expert_maps[layer_id] + current_expert_maps_this_layer = current_expert_maps[layer_id] + + expert_send_info_this_layer = dict() + 
expert_recv_info_this_layer = dict() + + # Guard Clause: if there is no expert weight update, avoid subsequent processing + if torch.equal(updated_expert_maps_this_layer, current_expert_maps_this_layer): + yield (expert_send_info_this_layer, expert_recv_info_this_layer, updated_expert_maps_this_layer, layer_id) + + # Parse expert_ids each rank needs to receive from other ranks + dst_rank_indices, experts_to_recv = torch.where((current_expert_maps_this_layer == -1) \ + & (updated_expert_maps_this_layer != -1)) + + # Parse expert_ids each rank needs to send to other ranks + src_rank_indices, experts_to_send = torch.where((current_expert_maps_this_layer != -1) \ + & (updated_expert_maps_this_layer == -1)) + + for idx in range(len(dst_rank_indices)): + dst_rank_id = dst_rank_indices[idx].item() + expert_id = experts_to_recv[idx].item() + if dst_rank_id not in expert_recv_info_this_layer: + expert_recv_info_this_layer[dst_rank_id] = [] + + if not torch.isin(src_rank_indices, torch.tensor(expert_id)).any(): + # if expert_id are not sent out from any npu, it will be copied from one npu holding this expert + candidate_src_rank_indices = torch.where(current_expert_maps_this_layer[:, expert_id] != -1) + else: + candidate_src_rank_indices = src_rank_indices[experts_to_send == expert_id] + + #TODO: improve selection criterion of npu sending expert_id considering such as intra-node or inter-node... + src_rank_id = candidate_src_rank_indices[0].item() + if src_rank_id not in expert_send_info_this_layer: + expert_send_info_this_layer[src_rank_id] = [] + + expert_send_info_this_layer[src_rank_id].append((dst_rank_id, expert_id)) + expert_recv_info_this_layer[dst_rank_id].append((src_rank_id, expert_id)) + + yield (expert_send_info_this_layer, expert_recv_info_this_layer, updated_expert_maps_this_layer, layer_id) + + + def calculate_rebalance_experts(self, load_info, old_placement): + """ + 通过 policy 实例的 rebalance_experts 方法计算 new_map。 + """ + if self.old_expert_maps is None: + return False, None, None + + changed, priority, new_map = self.policy.rebalance_experts(old_placement, load_info) + return changed, priority, new_map + + def get_init_expert_maps(self): + """ + Read the initial expert_map from shared_dict. + """ + return self.shared_dict.get("expert_maps", None) + + def fetch_and_sum_load_info(self): + """ + Each time the subprocess is awakened, read the latest moe_load + (shape: [num_moe_layers, num_experts_per_layer]) from shared_dict. 
+ """ + return self.shared_dict.get("moe_load", None) + + def update_expert_map(self, expert_maps): + + self.shared_dict["expert_maps"] = expert_maps + + def global2local(self, + workload: torch.Tensor, + placement: torch.Tensor, + E_local: int + ) -> tuple[torch.Tensor, torch.Tensor]: + + L, G, _ = placement.shape + device = placement.device + + wt_local = torch.full((L, G, E_local), + fill_value=-1, + dtype=workload.dtype, + device=device) + pt_local = torch.full((L, G, E_local), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement >= 0 + l_idx, g_idx, k_idx = valid.nonzero(as_tuple=True) + + slot_idx = placement[l_idx, g_idx, k_idx] + values = workload[l_idx, g_idx, k_idx] + + wt_local[l_idx, g_idx, slot_idx] = values + pt_local[l_idx, g_idx, slot_idx] = k_idx + + return wt_local, pt_local + + + def local2global(self, + placement_local: torch.Tensor + ) -> torch.Tensor: + + L, G, E_local = placement_local.shape + device = placement_local.device + + max_id = torch.max(placement_local) + E_global = (max_id + 1).item() if max_id >= 0 else 0 + + if E_global == 0: + return torch.empty((L, G, 0), dtype=torch.long, device=device) + + placement_global = torch.full((L, G, E_global), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement_local >= 0 + l_idx, g_idx, slot_idx = valid.nonzero(as_tuple=True) + gid_idx = placement_local[l_idx, g_idx, slot_idx] + + placement_global[l_idx, g_idx, gid_idx] = slot_idx + + return placement_global + + +class EplbProcess: + def __init__(self, shared_dict, planner_q, block_update_q, policy_type: int = 0, enable_d2d: bool = True): + """ + Args: + shared_dict: Cross-process shared dict returned by Manager().dict() + policy_type: Integer passed to PolicyFactory.generate_policy + enable_d2d: Whether to enable D2D loading + """ + self.shared_dict = shared_dict + self.policy_type = policy_type + self.enable_d2d = enable_d2d + self.planner_q = planner_q + self.block_update_q = block_update_q + + # Create EplbWorker instance + self.worker = EplbWorker(self.shared_dict, self.policy_type, self.enable_d2d) + + + def worker_process(self,planner_q,block_update_q): + """ + Subprocess entry: bind to specified NPU, loop waiting for planner_q to wake up, call do_update, then notify main process update is complete. + """ + while True: + try: + + planner_q.get() + + update_info = self.worker.do_update() + + print("update_info:", update_info) + + for (a,b,c,d) in update_info: + while True: + if not block_update_q.empty(): + continue + block_update_q.put((a,b,c,d)) + break + + print("EPLB Process complete") + + except Exception as e: + logger.warning(f"[EPLB subprocess Exiting due to error: {e}", exc_info=True) + break + + def _launch_process(self): + """ + Use spawn method to launch subprocess and return (planner_q, block_update_q, proc). 
+ """ + proc = Process( + target=self.worker_process, + args=(self.planner_q,self.block_update_q), + daemon=True + ) + + proc.start() + return proc + From d6391449c358b0e4043338f4f9069eee820c4d21 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 10 Jun 2025 21:33:01 +0800 Subject: [PATCH 011/153] add ssd loader --- vllm_ascend/eplb/core/loader/ssd_loader.py | 295 +++++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 vllm_ascend/eplb/core/loader/ssd_loader.py diff --git a/vllm_ascend/eplb/core/loader/ssd_loader.py b/vllm_ascend/eplb/core/loader/ssd_loader.py new file mode 100644 index 0000000000..2c00c13a85 --- /dev/null +++ b/vllm_ascend/eplb/core/loader/ssd_loader.py @@ -0,0 +1,295 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import os +import json +import re +import time +import queue +import math +import multiprocessing as mp +from dataclasses import dataclass +from typing import Dict, List, Tuple, Optional, Generator, Any + +import torch +import torch_npu +from safetensors import safe_open +from vllm_ascend.eplb.core.loader.abstract_loader import ExpertWeightLoader + + +def log_debug(msg: str): + print(f"[DEBUG] {msg}") + + +class SSDExpertWeightLoader: + + """ + Load all tensors that belong to one (layer, expert_id) pair, CPU only. + """ + + def __init__(self, load_config: Optional[dict] = None): + self.load_config = load_config or {} + self.counter_before_loading_weights = 0.0 + + + def _prepare_index( + self, + model_dir: str, + index_file: str + ) -> Dict[str, str]: + index_path = os.path.join(model_dir, index_file) + if not os.path.exists(index_path): + raise FileNotFoundError(index_path) + with open(index_path, "r", encoding="utf-8") as f: + data = json.load(f) + if "weight_map" not in data: + raise KeyError("weight_map missing in index file") + return data["weight_map"] + + + def load_expert_weight_from_ssd( + self, + model_dir: str, + layer_prefix: str, + expert_id: int, + index_file: str = "model.safetensors.index.json" + ) -> Dict[str, torch.Tensor]: + index_map = self._prepare_index(model_dir, index_file) + + # collect parameter names under the target expert + prefix = f"{layer_prefix}.experts.{expert_id}." 
+ param_to_shard = { + name: shard for name, shard in index_map.items() + if name.startswith(prefix) + } + if not param_to_shard: + raise KeyError(f"expert {expert_id} not found under {layer_prefix}") + + # group by shard + shard_to_params: Dict[str, List[str]] = {} + for p, s in param_to_shard.items(): + shard_to_params.setdefault(s, []).append(p) + + result: Dict[str, torch.Tensor] = {} + for shard, params in shard_to_params.items(): + shard_path = os.path.join(model_dir, shard) + if not os.path.exists(shard_path): + raise FileNotFoundError(shard_path) + with safe_open(shard_path, framework="pt", device="cpu") as reader: + for p in params: + result[p] = reader.get_tensor(p) + return result + +class EplbLoaderProcess(mp.Process): + """ + Independent process for blocking SSD reads. + """ + + def __init__( + self, + model_dir: str, + req_q: mp.Queue, + res_q: mp.Queue, + quit_evt: mp.Event, + index_file: str = "quant_model_weight_w8a8_dynamic.safetensors.index.json" + ): + super().__init__(daemon=True) + self.model_dir = model_dir + self.req_q = req_q + self.res_q = res_q + self.quit_evt = quit_evt + self.index_file = index_file + self.loader = SSDExpertWeightLoader() + + # ---------- process loop -------------------------------------------------# + def run(self) -> None: + while not self.quit_evt.is_set(): + try: + job = self.req_q.get(timeout=0.1) + except queue.Empty: + continue + + if job["type"] == "shutdown": + break + + try: + job_id: int = job["job_id"] + layer_prefix, expert_id = job["layer_prefix"], job["expert_id"] + + tensors = self.loader.load_expert_weight_from_ssd( + self.model_dir, + layer_prefix=layer_prefix, + expert_id=expert_id, + index_file=self.index_file + ) + self.res_q.put((job_id, tensors, None)) + except Exception as exc: + self.res_q.put((job["job_id"], None, str(exc))) + + @staticmethod + def _rpc( + layer_prefix: str, + expert_id: int, + model_dir: str, + index_file: str, + req_q: mp.Queue, + res_q: mp.Queue, + timeout: float = 30.0 + ) -> Dict[str, torch.Tensor]: + job_id = time.time_ns() + req_q.put(dict( + type="load", + job_id=job_id, + layer_prefix=layer_prefix, + expert_id=expert_id + )) + t0 = time.time() + while True: + try: + jid, tensors, err = res_q.get(timeout=0.1) + except queue.Empty: + if time.time() - t0 > timeout: + raise RuntimeError("EPLB I/O RPC timeout") + continue + if jid != job_id: + res_q.put((jid, tensors, err)) + continue + if err: + raise RuntimeError(err) + return tensors + + +@dataclass +class BatchMeta: + layer_ids: List[int] + tensors: List[torch.Tensor] # CPU tensors in this batch + + +class EplbRebalanceLoader: + + def __init__( + self, + model_dir: str, + num_layers: int, + first_dense_layers: int, + experts_per_layer: int, + layer_group_size: int = 8, + index_file: str = "quant_model_weight_w8a8_dynamic.safetensors.index.json" + enable_subprocess: bool = True + ): + self.model_dir = model_dir + self.first_dense = first_dense_layers + self.num_moe_layers = num_layers - first_dense_layers + self.experts_per_layer = experts_per_layer + self.layer_group_size = layer_group_size + self.index_file = index_file + + self.cpu_buf: List[torch.Tensor] = [] + self.npu_buf: List[torch.Tensor] = [] + + # multi-process I/O + self.enable_mp = enable_subprocess + if self.enable_mp: + self.req_q: mp.Queue = mp.Queue() + self.res_q: mp.Queue = mp.Queue() + self.quit_evt = mp.Event() + self.proc = EplbLoaderProcess( + model_dir, + self.req_q, + self.res_q, + self.quit_evt + ) + self.proc.start() + + def _load_one_expert_cpu(self, layer_prefix: 
str, expert_id: int) -> Dict[str, torch.Tensor]: + if self.enable_mp: + return EplbLoaderProcess._rpc( + layer_prefix=layer_prefix, + expert_id=expert_id, + model_dir=self.model_dir, + index_file=index_file, + req_q=self.req_q, + res_q=self.res_q + ) + + loader = SSDExpertWeightLoader() + return loader.load_expert_weight_from_ssd( + self.model_dir, + layer_prefix=layer_prefix, + expert_id=expert_id + ) + + def load_batch(self, batch_idx: int, expert_ids: List[int]) -> BatchMeta: + + start_layer = batch_idx * self.layer_group_size + self.first_dense + end_layer = min(start_layer + self.layer_group_size, + self.first_dense + self.num_moe_layers) + + tensors: List[torch.Tensor] = [] + layer_ids: List[int] = list(range(start_layer, end_layer)) + + for lid in layer_ids: + prefix = f"model.layers.{lid}.mlp" + for eid in expert_ids: + layer_pref = prefix + loaded = self._load_one_expert_cpu(layer_pref, eid) + tensors.extend(loaded.values()) + # keep CPU buffer for later copy + self.cpu_buf = tensors + return BatchMeta(layer_ids, tensors) + + def host_to_npu(self): + if not self.cpu_buf: + raise RuntimeError("CPU buffer is empty, call load_batch first") + + if not self.npu_buf: + self.npu_buf = [t.npu() for t in self.cpu_buf] + else: + for dst, src in zip(self.npu_buf, self.cpu_buf): + dst.copy_(src) + + def shutdown(self): + if self.enable_mp: + self.quit_evt.set() + self.req_q.put(dict(type="shutdown")) + self.proc.join(timeout=2.0) + log_debug("EplbLoaderProcess terminated") + + +if __name__ == "__main__": + + index_file = 'quant_model_weight_w8a8_dynamic.safetensors.index.json' + + loader = EplbRebalanceLoader( + model_dir="/home/data/eplb_vllm/DeepSeek-V3-W8A8", + num_layers=24, + first_dense_layers=3, + experts_per_layer=4, + layer_group_size=4, + enable_subprocess=True, + index_file = index_file + ) + + # load batch 0 (layers 2-5) for experts 0 and 2 + meta = loader.load_batch(batch_idx=0, expert_ids=[0, 2]) + log_debug(f"CPU tensors in batch: {len(meta.tensors)}") + + # copy to NPU + loader.host_to_npu() + log_debug(f"NPU buffer tensors: {len(loader.npu_buf)}") + + loader.shutdown() From f1f936b3efbd456bc9576880f15bd5958c164972 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 10 Jun 2025 22:02:51 +0800 Subject: [PATCH 012/153] EPLB moed load collect --- vllm_ascend/eplb/core/loader/ssd_loader.py | 295 --------------------- vllm_ascend/models/deepseek_v2.py | 33 +++ vllm_ascend/ops/fused_moe.py | 28 +- vllm_ascend/quantization/w8a8_dynamic.py | 6 +- vllm_ascend/worker/model_runner_v1.py | 29 ++ 5 files changed, 92 insertions(+), 299 deletions(-) delete mode 100644 vllm_ascend/eplb/core/loader/ssd_loader.py diff --git a/vllm_ascend/eplb/core/loader/ssd_loader.py b/vllm_ascend/eplb/core/loader/ssd_loader.py deleted file mode 100644 index 2c00c13a85..0000000000 --- a/vllm_ascend/eplb/core/loader/ssd_loader.py +++ /dev/null @@ -1,295 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# This file is a part of the vllm-ascend project. -# - -import os -import json -import re -import time -import queue -import math -import multiprocessing as mp -from dataclasses import dataclass -from typing import Dict, List, Tuple, Optional, Generator, Any - -import torch -import torch_npu -from safetensors import safe_open -from vllm_ascend.eplb.core.loader.abstract_loader import ExpertWeightLoader - - -def log_debug(msg: str): - print(f"[DEBUG] {msg}") - - -class SSDExpertWeightLoader: - - """ - Load all tensors that belong to one (layer, expert_id) pair, CPU only. - """ - - def __init__(self, load_config: Optional[dict] = None): - self.load_config = load_config or {} - self.counter_before_loading_weights = 0.0 - - - def _prepare_index( - self, - model_dir: str, - index_file: str - ) -> Dict[str, str]: - index_path = os.path.join(model_dir, index_file) - if not os.path.exists(index_path): - raise FileNotFoundError(index_path) - with open(index_path, "r", encoding="utf-8") as f: - data = json.load(f) - if "weight_map" not in data: - raise KeyError("weight_map missing in index file") - return data["weight_map"] - - - def load_expert_weight_from_ssd( - self, - model_dir: str, - layer_prefix: str, - expert_id: int, - index_file: str = "model.safetensors.index.json" - ) -> Dict[str, torch.Tensor]: - index_map = self._prepare_index(model_dir, index_file) - - # collect parameter names under the target expert - prefix = f"{layer_prefix}.experts.{expert_id}." - param_to_shard = { - name: shard for name, shard in index_map.items() - if name.startswith(prefix) - } - if not param_to_shard: - raise KeyError(f"expert {expert_id} not found under {layer_prefix}") - - # group by shard - shard_to_params: Dict[str, List[str]] = {} - for p, s in param_to_shard.items(): - shard_to_params.setdefault(s, []).append(p) - - result: Dict[str, torch.Tensor] = {} - for shard, params in shard_to_params.items(): - shard_path = os.path.join(model_dir, shard) - if not os.path.exists(shard_path): - raise FileNotFoundError(shard_path) - with safe_open(shard_path, framework="pt", device="cpu") as reader: - for p in params: - result[p] = reader.get_tensor(p) - return result - -class EplbLoaderProcess(mp.Process): - """ - Independent process for blocking SSD reads. 
- """ - - def __init__( - self, - model_dir: str, - req_q: mp.Queue, - res_q: mp.Queue, - quit_evt: mp.Event, - index_file: str = "quant_model_weight_w8a8_dynamic.safetensors.index.json" - ): - super().__init__(daemon=True) - self.model_dir = model_dir - self.req_q = req_q - self.res_q = res_q - self.quit_evt = quit_evt - self.index_file = index_file - self.loader = SSDExpertWeightLoader() - - # ---------- process loop -------------------------------------------------# - def run(self) -> None: - while not self.quit_evt.is_set(): - try: - job = self.req_q.get(timeout=0.1) - except queue.Empty: - continue - - if job["type"] == "shutdown": - break - - try: - job_id: int = job["job_id"] - layer_prefix, expert_id = job["layer_prefix"], job["expert_id"] - - tensors = self.loader.load_expert_weight_from_ssd( - self.model_dir, - layer_prefix=layer_prefix, - expert_id=expert_id, - index_file=self.index_file - ) - self.res_q.put((job_id, tensors, None)) - except Exception as exc: - self.res_q.put((job["job_id"], None, str(exc))) - - @staticmethod - def _rpc( - layer_prefix: str, - expert_id: int, - model_dir: str, - index_file: str, - req_q: mp.Queue, - res_q: mp.Queue, - timeout: float = 30.0 - ) -> Dict[str, torch.Tensor]: - job_id = time.time_ns() - req_q.put(dict( - type="load", - job_id=job_id, - layer_prefix=layer_prefix, - expert_id=expert_id - )) - t0 = time.time() - while True: - try: - jid, tensors, err = res_q.get(timeout=0.1) - except queue.Empty: - if time.time() - t0 > timeout: - raise RuntimeError("EPLB I/O RPC timeout") - continue - if jid != job_id: - res_q.put((jid, tensors, err)) - continue - if err: - raise RuntimeError(err) - return tensors - - -@dataclass -class BatchMeta: - layer_ids: List[int] - tensors: List[torch.Tensor] # CPU tensors in this batch - - -class EplbRebalanceLoader: - - def __init__( - self, - model_dir: str, - num_layers: int, - first_dense_layers: int, - experts_per_layer: int, - layer_group_size: int = 8, - index_file: str = "quant_model_weight_w8a8_dynamic.safetensors.index.json" - enable_subprocess: bool = True - ): - self.model_dir = model_dir - self.first_dense = first_dense_layers - self.num_moe_layers = num_layers - first_dense_layers - self.experts_per_layer = experts_per_layer - self.layer_group_size = layer_group_size - self.index_file = index_file - - self.cpu_buf: List[torch.Tensor] = [] - self.npu_buf: List[torch.Tensor] = [] - - # multi-process I/O - self.enable_mp = enable_subprocess - if self.enable_mp: - self.req_q: mp.Queue = mp.Queue() - self.res_q: mp.Queue = mp.Queue() - self.quit_evt = mp.Event() - self.proc = EplbLoaderProcess( - model_dir, - self.req_q, - self.res_q, - self.quit_evt - ) - self.proc.start() - - def _load_one_expert_cpu(self, layer_prefix: str, expert_id: int) -> Dict[str, torch.Tensor]: - if self.enable_mp: - return EplbLoaderProcess._rpc( - layer_prefix=layer_prefix, - expert_id=expert_id, - model_dir=self.model_dir, - index_file=index_file, - req_q=self.req_q, - res_q=self.res_q - ) - - loader = SSDExpertWeightLoader() - return loader.load_expert_weight_from_ssd( - self.model_dir, - layer_prefix=layer_prefix, - expert_id=expert_id - ) - - def load_batch(self, batch_idx: int, expert_ids: List[int]) -> BatchMeta: - - start_layer = batch_idx * self.layer_group_size + self.first_dense - end_layer = min(start_layer + self.layer_group_size, - self.first_dense + self.num_moe_layers) - - tensors: List[torch.Tensor] = [] - layer_ids: List[int] = list(range(start_layer, end_layer)) - - for lid in layer_ids: - prefix = 
f"model.layers.{lid}.mlp" - for eid in expert_ids: - layer_pref = prefix - loaded = self._load_one_expert_cpu(layer_pref, eid) - tensors.extend(loaded.values()) - # keep CPU buffer for later copy - self.cpu_buf = tensors - return BatchMeta(layer_ids, tensors) - - def host_to_npu(self): - if not self.cpu_buf: - raise RuntimeError("CPU buffer is empty, call load_batch first") - - if not self.npu_buf: - self.npu_buf = [t.npu() for t in self.cpu_buf] - else: - for dst, src in zip(self.npu_buf, self.cpu_buf): - dst.copy_(src) - - def shutdown(self): - if self.enable_mp: - self.quit_evt.set() - self.req_q.put(dict(type="shutdown")) - self.proc.join(timeout=2.0) - log_debug("EplbLoaderProcess terminated") - - -if __name__ == "__main__": - - index_file = 'quant_model_weight_w8a8_dynamic.safetensors.index.json' - - loader = EplbRebalanceLoader( - model_dir="/home/data/eplb_vllm/DeepSeek-V3-W8A8", - num_layers=24, - first_dense_layers=3, - experts_per_layer=4, - layer_group_size=4, - enable_subprocess=True, - index_file = index_file - ) - - # load batch 0 (layers 2-5) for experts 0 and 2 - meta = loader.load_batch(batch_idx=0, expert_ids=[0, 2]) - log_debug(f"CPU tensors in batch: {len(meta.tensors)}") - - # copy to NPU - loader.host_to_npu() - log_debug(f"NPU buffer tensors: {len(loader.npu_buf)}") - - loader.shutdown() diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 96c76338bb..e48df0508a 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -723,6 +723,39 @@ def forward( inputs_embeds) return hidden_states + def update_expert_map(self,new_expert_map,layer_id): + self.model.layers[layer_id].mlp.experts.update_map(new_expert_map) + + def update_all_expert_map(self, new_expert_map,num_moe_layers): + num_moe_layers = len(new_expert_map) + for layer_id in range(num_moe_layers): + layer_map = new_expert_map[layer_id].to("npu") + self.model.layers[3+layer_id].mlp.experts.update_map(layer_map) + + def get_expert_map(self,layer_id): + return self.model.layers[layer_id].mlp.experts.get_map() + + def get_all_expert_map(self,num_moe_layers): + all_loads = [] + for layer_id in range(num_moe_layers): + load_tensor = self.get_expert_map(3+layer_id) # (num_experts_per_layer,) + all_loads.append(load_tensor) + + return torch.stack(all_loads, dim=0) + + def get_moe_load(self,layer_id): + return self.model.layers[layer_id].mlp.experts.get_moe_load() + + def get_all_moe_loads(self, num_moe_layers) -> torch.Tensor: + """ + output: [num_moe_layers, num_experts_per_layer] + """ + all_loads = [] + for layer_id in range(num_moe_layers): + load_tensor = self.get_moe_load(3+layer_id) # (num_experts_per_layer,) + all_loads.append(load_tensor) + + return torch.stack(all_loads, dim=0) class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM): pass diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 25f3b05d59..3a391653fb 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -991,6 +991,8 @@ def __init__( AscendFusedMoE.moe_counter += 1 self.moe_instance_id = AscendFusedMoE.moe_counter + self.moe_load = None + self.topk_ids = None if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -1132,7 +1134,7 @@ def forward(self, hidden_states, router_logits) # Matrix multiply. 
- hidden_states = self.quant_method.apply( + hidden_states, self.topk_ids = self.quant_method.apply( layer=self, x=hidden_states, router_logits=router_logits, @@ -1152,6 +1154,8 @@ def forward(self, global_redundant_expert_num=self.global_redundant_expert_num, **kwargs) + self.calculate_moe_load() + if self.enable_multistream_shared_expert and not is_prefill: hidden_states, shared_output = hidden_states @@ -1209,3 +1213,25 @@ def _forward_ms_fused_moe_comp( enable_force_load_balance=enable_force_load_balance) return hidden_states + + def update_map(self,new_expert_map): + self.expert_map = new_expert_map + + def get_map(self): + return self.expert_map + + def get_moe_load(self): + return self.moe_load + + def calculate_moe_load(self): + if self.moe_load is None: + self.moe_load = torch.zeros(self.num_experts, + dtype=torch.int64, + device=self.topk_ids.device) + + ids = self.topk_ids.flatten().to(torch.int64) + + ones = torch.ones_like(ids, dtype=torch.int64, device=ids.device) + self.moe_load.scatter_add_(0, ids, ones) + + return self.moe_load \ No newline at end of file diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index c6e863ff71..049b87f025 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -706,7 +706,7 @@ def apply( moe_all_to_all_group_name=self.moe_all_to_all_group_name, log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, - **kwargs) + **kwargs), topk_ids elif self.torchair_graph_enabled or self.ep_group.world_size == 1: return fused_experts(hidden_states=x, w1=layer.w13_weight, @@ -716,7 +716,7 @@ def apply( topk_weights=topk_weights, topk_ids=topk_ids, top_k=top_k, - expert_map=expert_map) + expert_map=expert_map), topk_ids else: # The current implementation of deepseek moe splits hidden_states # according to tp_size before they are feed into fused_moe module. 
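Editor's note: the expert-hit counting added in calculate_moe_load above boils down to a single scatter_add_ over the flattened top-k ids. The standalone sketch below uses made-up sizes and runs on CPU; it mirrors the patch's logic but is not taken from it verbatim.

```python
import torch

# Made-up sizes: 16 tokens, top-2 routing over 8 experts.
num_experts = 8
topk_ids = torch.randint(0, num_experts, (16, 2))

moe_load = torch.zeros(num_experts, dtype=torch.int64)
ids = topk_ids.flatten().to(torch.int64)
# Add 1 to the bucket of every expert id that appears in topk_ids.
moe_load.scatter_add_(0, ids, torch.ones_like(ids))
print(moe_load)  # per-expert token counts for this forward step
```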
@@ -735,7 +735,7 @@ def apply( ep_group=self.ep_group, log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, - ) + ), topk_ids def process_weights_after_loading(self, layer): if self.transpose_weight: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 2b343d7f7d..df5971c26e 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -25,6 +25,7 @@ from contextlib import contextmanager, nullcontext from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Union +from multiprocessing import Manager import numpy as np import numpy.typing as npt @@ -77,6 +78,10 @@ from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer +from vllm_ascend.eplb.eplb_updator import EplbUpdator +from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor +from vllm_ascend.eplb.core.loader.device_transfer_loader import D2DExpertWeightLoader + if TYPE_CHECKING: import xgrammar as xgr # type: ignore[import-untyped] from vllm.v1.core.sched.output import SchedulerOutput @@ -363,6 +368,20 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.dp_size = vllm_config.parallel_config.data_parallel_size self.dp_rank = vllm_config.parallel_config.data_parallel_rank + #EPLB + self.enable_eplb = True + # if additional_config: + # self.enable_torchair_graph_mode = additional_config.get( + # "enable_graph_mode", + # False) and self.vllm_config.model_config.use_mla + # self.use_cached_npu_graph = additional_config.get( + # "use_cached_npu_graph", False) + # self.enable_eplb = additional_config.get("enable_eplb", False) + + if self.enable_eplb == True: + self.eplb_adaptor = None + self.eplb_updator = EplbUpdator() + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler output. 
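Editor's note: the model-runner hunk above hardcodes self.enable_eplb = True and leaves the additional_config lookup commented out. A possible way to gate the feature from configuration could look like the sketch below; this is only an assumption based on the commented-out code, and the "enable_eplb" key is not an interface this patch actually exposes.

```python
from typing import Any, Dict, Optional


def resolve_enable_eplb(additional_config: Optional[Dict[str, Any]]) -> bool:
    """Read the EPLB switch from vLLM's --additional-config dict (assumed key)."""
    if not additional_config:
        return False
    return bool(additional_config.get("enable_eplb", False))


assert resolve_enable_eplb(None) is False
assert resolve_enable_eplb({"enable_eplb": True}) is True
```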
@@ -1205,6 +1224,13 @@ def execute_model( scheduler_output: "SchedulerOutput", intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[ModelRunnerOutput, torch.Tensor]: + + if self.enable_eplb: + if self.eplb_adaptor == None: + self.eplb_adaptor = VllmEplbAdaptor(model=self.model) + self.eplb_updator.set_adaptor(self.eplb_adaptor) + self.eplb_updator.forward_before() + with ProfileExecuteDuration().capture_async( "prepare input and forward"): self._update_states(scheduler_output) @@ -1320,6 +1346,9 @@ def execute_model( print(f"Profile execute duration [{captured_name}]:", " ".join(dr_str)) + if self.enable_eplb: + self.eplb_updator.forward_end() + return model_runner_output def _profile_multimodal(self) -> None: From bd924f2eecd27cd63b3d36cee2e22c79c6424599 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 10 Jun 2025 22:03:52 +0800 Subject: [PATCH 013/153] delete invalida import --- vllm_ascend/worker/model_runner_v1.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index df5971c26e..d749d5bb7d 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -25,7 +25,6 @@ from contextlib import contextmanager, nullcontext from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Union -from multiprocessing import Manager import numpy as np import numpy.typing as npt From 291c216898c989efb4b59a64dad818d2dd48a71c Mon Sep 17 00:00:00 2001 From: Pleaplusone <38376071+ganyi1996ppo@users.noreply.github.com> Date: Tue, 10 Jun 2025 22:20:40 +0800 Subject: [PATCH 014/153] fix torchair execute issue on padding data, and mtp padding logic (#1160) ### What this PR does / why we need it? The former PR https://github.com/vllm-project/vllm-ascend/pull/736 select the valid token inside the `input_ids` and `position_ids` breaks the necessary padding required by torchair. In this PR, we pending the pad logic after the multimodal part. Signed-off-by: ganyi --- vllm_ascend/attention/mla_v1.py | 5 ++++- vllm_ascend/worker/model_runner_v1.py | 10 +++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 226a5705cd..4a8b5902d0 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -376,7 +376,10 @@ def build( seq_lens = seq_lens[:self._num_decode_tokens] input_positions = input_positions[:self._num_decode_tokens] block_table = block_table[:self._num_decode_tokens, ...] 
- if use_torchair_graph and self.runner.attn_state == AscendAttentionState.DecodeOnly: + if use_torchair_graph and self.runner.attn_state in [ + AscendAttentionState.DecodeOnly, + AscendAttentionState.SpecDecoding + ]: num_seqs = len(seq_lens) if graph_pad_size != 0: pad_value = 1 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 2b343d7f7d..51ffe1ec2d 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -943,11 +943,6 @@ def _process_reqs( self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) input_ids = self.input_ids[:num_input_tokens] - if (envs_ascend.VLLM_ENABLE_MC2 - or self.torchair_graph_enabled) and not with_prefill: - input_ids = self.input_ids[:padded_batch_size] - positions = self.positions[:padded_batch_size] - # prepare the MRoPE for mllm if using multimodal num_input_tokens = total_num_scheduled_tokens # _prepare_inputs may reorder the batch, so we must gather multi @@ -985,6 +980,11 @@ def _process_reqs( else: positions = self.positions[:num_input_tokens] + if (envs_ascend.VLLM_ENABLE_MC2 + or self.torchair_graph_enabled) and not with_prefill: + input_ids = self.input_ids[:padded_batch_size] + positions = self.positions[:padded_batch_size] + # Run forward pass with set_forward_context(attn_metadata, self.vllm_config, From 8dd686dfa20d8f577cd63fb751ce0a7a98f2f516 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Tue, 10 Jun 2025 22:26:53 +0800 Subject: [PATCH 015/153] [MLA][Graph] Improve assertion on Graph mode with MLA (#933) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Improve assertion on Graph mode with MLA. When running deepseek with graph mode, the fused MLA op only support `numHeads / numKvHeads ∈ {32, 64, 128}`, thus we improve the assertion info here to avoid users confused with this. ### Does this PR introduce _any_ user-facing change? Adjusting tp size is required when running deepseek-v3/r1 with graph mode. deepseek-v2-lite is not supported in graph mode. ### How was this patch tested? Test locally as the CI machine could not run V3 due to the HBM limits. --------- Signed-off-by: MengqingCao --- docs/source/faqs.md | 10 ++++++++++ vllm_ascend/attention/attention.py | 11 +++++++++++ vllm_ascend/attention/mla_v1.py | 11 +++++++++++ vllm_ascend/worker/multi_step_worker.py | 2 +- 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/docs/source/faqs.md b/docs/source/faqs.md index 1d355b5e9a..1de3befb2d 100644 --- a/docs/source/faqs.md +++ b/docs/source/faqs.md @@ -113,3 +113,13 @@ In scenarios where NPUs have limited HBM (High Bandwidth Memory) capacity, dynam - **Adjust `--gpu-memory-utilization`**: If unspecified, will use the default value of `0.9`. You can decrease this param to reserve more memory to reduce fragmentation risks. See more note in: [vLLM - Inference and Serving - Engine Arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine.arg_utils-_engine_args_parser-cacheconfig). - **Configure `PYTORCH_NPU_ALLOC_CONF`**: Set this environment variable to optimize NPU memory management. For example, you can `export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` to enable virtual memory feature to mitigate memory fragmentation caused by frequent dynamic memory size adjustments during runtime, see more note in: [PYTORCH_NPU_ALLOC_CONF](https://www.hiascend.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html). + +### 15. 
Failed to enable NPU graph mode when running DeepSeek? +You may encounter the following error if running DeepSeek with NPU graph mode enabled. The allowed number of queries per kv when enabling both MLA and Graph mode only support {32, 64, 128}, **Thus this is not supported for DeepSeek-V2-Lite**, as it only has 16 attention heads. The NPU graph mode support on DeepSeek-V2-Lite will be done in the future. + +And if you're using DeepSeek-V3 or DeepSeek-R1, please make sure after the tensor parallel split, num_heads / num_kv_heads in {32, 64, 128}. + +```bash +[rank0]: RuntimeError: EZ9999: Inner Error! +[rank0]: EZ9999: [PID: 62938] 2025-05-27-06:52:12.455.807 numHeads / numKvHeads = 8, MLA only support {32, 64, 128}.[FUNC:CheckMlaAttrs][FILE:incre_flash_attention_tiling_check.cc][LINE:1218] +``` diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py index 8f130e4241..a567cc5306 100644 --- a/vllm_ascend/attention/attention.py +++ b/vllm_ascend/attention/attention.py @@ -40,6 +40,8 @@ from vllm_ascend.worker.model_runner import ( ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata) +_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128] + def generate_attn_mask(max_seq_len: int, dtype=torch.float16, mask_value=None): # Construct lower triangle matrix. @@ -1005,6 +1007,15 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + # TODO: support numHeads / numKvHeads < 16 in MLA kernel + if self.torchair_graph_enabled: + assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ + ("The allowed number of queries per kv when enabling both MLA and Graph mode" + " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," + " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1," + " please make sure after the tensor parallel split, num_heads / num_kv_heads in " + "{32, 64, 128}.") + def exec_kv( self, hidden_states: torch.Tensor, diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 4a8b5902d0..7f603422c8 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -13,6 +13,7 @@ UnquantizedLinearMethod) from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig from vllm_ascend.multistream.context import get_multistream_comm_context @@ -475,6 +476,7 @@ def __init__( self.o_proj = kwargs['o_proj'] self.kv_a_proj_with_mqa = kwargs.get('kv_a_proj_with_mqa', None) self.kv_a_layernorm = kwargs.get('kv_a_layernorm', None) + self.num_queries_per_kv = self.num_heads // self.num_kv_heads ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled @@ -484,6 +486,15 @@ def __init__( self.spec_token_num = speculative_config.num_speculative_tokens assert self.spec_token_num > 0 + # TODO: support numHeads / numKvHeads < 16 in MLA kernel + if self.torchair_graph_enabled: + assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ + ("The allowed number of queries per kv when enabling both MLA and Graph mode" + " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," + " as it only has 16 attention heads. 
And if you're using DeepSeek-V3 or DeepSeek-R1," + " please make sure after the tensor parallel split, num_heads / num_kv_heads in " + "{32, 64, 128}.") + def _v_up_proj_and_o_proj(self, x): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) diff --git a/vllm_ascend/worker/multi_step_worker.py b/vllm_ascend/worker/multi_step_worker.py index ba83f6b962..6d092805d5 100644 --- a/vllm_ascend/worker/multi_step_worker.py +++ b/vllm_ascend/worker/multi_step_worker.py @@ -119,7 +119,7 @@ def _prepare_last_sampled_token_ids_for_tp_workers( # execute_model_req assert execute_model_req.last_sampled_token_ids is not None model_input.last_sampled_token_ids = ( - execute_model_req.last_sampled_token_ids.cuda()) + execute_model_req.last_sampled_token_ids.npu()) model_input.add_sampler_output( SamplerOutput(outputs=[], sampled_token_ids=None), model_input.last_sampled_token_ids) From 8b48daaa44e11b26024a52705d3d3d4056eb57b6 Mon Sep 17 00:00:00 2001 From: 22dimensions Date: Wed, 11 Jun 2025 06:18:32 +0800 Subject: [PATCH 016/153] [CI] rename Qwen2.5-0.5B-Instruct-W8A8 model (#1145) 1. rename vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new to vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8 Signed-off-by: 22dimensions --- tests/singlecard/test_offline_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/singlecard/test_offline_inference.py b/tests/singlecard/test_offline_inference.py index 5b58df7928..de69612279 100644 --- a/tests/singlecard/test_offline_inference.py +++ b/tests/singlecard/test_offline_inference.py @@ -39,7 +39,7 @@ MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"] QUANTIZATION_MODELS = [ - "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new", + "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8", ] os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" From 04abfd8721d8cc8434b820ea4497cad6f1af6481 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Wed, 11 Jun 2025 07:31:13 +0800 Subject: [PATCH 017/153] [CI] Skip test_v1_spec_decode.py::test_ngram_correctness to make longterm CI pass (#1163) [CI] Skip test_v1_spec_decode.py::test_ngram_correctness to make longterm CI pass Related: https://github.com/vllm-project/vllm-ascend/issues/1162 Signed-off-by: MengqingCao --- .github/workflows/vllm_ascend_test_long_term.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index 390dbfe70c..2cc89172fc 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -97,7 +97,8 @@ jobs: if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then # spec decode test VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py - VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_spec_decode.py + # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed + # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_spec_decode.py VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process pytest -sv tests/long_term/spec_decode --ignore=tests/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py pytest -sv tests/long_term/test_accuracy.py From 7bdc606677705d072c1dc45f050a3c3471d6d379 Mon Sep 
17 00:00:00 2001 From: sdmyzlp <117554856+sdmyzlp@users.noreply.github.com> Date: Wed, 11 Jun 2025 09:18:38 +0800 Subject: [PATCH 018/153] Support multistream of shared experts in FusedMoE (#997) Contains on #1111 for completeness. ### What this PR does / why we need it? Implement multi-stream parallelism for MoE layers with shared experts, where computation of shared experts will be overlapped with expert token dispatch and combine. Also, when multi-stream is enabled, weights of shared experts will be force to replicate across all cards, regardless of any tensor parallelism configurations, to avoid AllReduce operations. With the expected overlaping being: ``` | shared gate_up | shared act | | shared down | | dispatch | routed gate_up, act, down | combine | ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Tested on 1x16 910 node, with tailored 2 layer DSKv2. --------- Signed-off-by: sdmyzlp --- .github/workflows/vllm_ascend_test.yaml | 2 + docs/source/user_guide/additional_config.md | 4 +- mypy.ini | 3 + .../test_offline_inference_distributed.py | 19 +- tests/singlecard/test_ascend_config.py | 4 +- vllm_ascend/ascend_config.py | 4 +- vllm_ascend/models/deepseek_dbo.py | 110 +-------- vllm_ascend/models/deepseek_v2.py | 217 +++++++++++------- vllm_ascend/ops/fused_moe.py | 77 ++++--- vllm_ascend/quantization/w8a8_dynamic.py | 142 +++++------- vllm_ascend/utils.py | 22 +- 11 files changed, 296 insertions(+), 308 deletions(-) diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 32524eecac..b02350234c 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -188,6 +188,7 @@ jobs: VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py fi @@ -218,5 +219,6 @@ jobs: VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py fi diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index c6558f4b61..b769a31ab9 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -39,11 +39,11 @@ The details of each config option are as follows: | Name | Type | Default | Description | | ---- | ---- | ------- | ----------- | | `enabled` | bool | `False` | Whether to enable torchair graph mode | +| 
`enable_multistream_moe`| bool | `False` | Whether to enable multistream shared expert | | `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization | | `use_cached_graph` | bool | `False` | Whether to use cached graph | | `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache | | `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty | -| `enable_multistream_shared_expert`| bool | `False` | Whether to enable multistream shared expert | **ascend_scheduler_config** @@ -64,7 +64,7 @@ A full example of additional configuration is as follows: "use_cached_graph": true, "graph_batch_sizes": [1, 2, 4, 8], "graph_batch_sizes_init": false, - "enable_multistream_shared_expert": false + "enable_multistream_moe": false }, "ascend_scheduler_config": { "enabled": true, diff --git a/mypy.ini b/mypy.ini index 72b03de21c..6fe8e6c298 100644 --- a/mypy.ini +++ b/mypy.ini @@ -6,6 +6,9 @@ warn_unused_configs = True [mypy-torch_npu.*] ignore_missing_imports = True +[mypy-torchair.*] +ignore_missing_imports = True + [mypy-transformers.*] ignore_missing_imports = True diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py index dc02c4b97e..f5ec2c872b 100644 --- a/tests/multicard/test_offline_inference_distributed.py +++ b/tests/multicard/test_offline_inference_distributed.py @@ -23,7 +23,7 @@ import os from unittest.mock import patch -import vllm # noqa: F401 +from modelscope import snapshot_download # type: ignore from vllm import SamplingParams from tests.conftest import VllmRunner @@ -95,3 +95,20 @@ def test_models_distributed_DeepSeek_dbo(): distributed_executor_backend="mp", ) as vllm_model: vllm_model.generate(example_prompts, sampling_params) + + +def test_models_distributed_DeepSeek_W8A8(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + + with VllmRunner( + snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"), + max_model_len=8192, + enforce_eager=True, + dtype="auto", + tensor_parallel_size=4, + quantization="ascend", + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/singlecard/test_ascend_config.py b/tests/singlecard/test_ascend_config.py index 484fe5f702..818745f308 100644 --- a/tests/singlecard/test_ascend_config.py +++ b/tests/singlecard/test_ascend_config.py @@ -58,7 +58,7 @@ def test_run_with_ascend_config(): "use_cached_graph": True, "graph_batch_sizes": [1, 2, 4, 8], "graph_batch_sizes_init": False, - "enable_multistream_shared_expert": True, + "enable_multistream_moe": True, }, "ascend_scheduler_config": { "enabled": True, @@ -79,7 +79,7 @@ def test_run_with_ascend_config(): 1, 2, 4, 8 ] assert not ascend_config.torchair_graph_config.graph_batch_sizes_init - assert ascend_config.torchair_graph_config.enable_multistream_shared_expert + assert ascend_config.torchair_graph_config.enable_multistream_moe assert ascend_config.ascend_scheduler_config.enabled assert ascend_config.ascend_scheduler_config.enable_chunked_prefill assert ascend_config.expert_tensor_parallel_size == 1 diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 0c072c3a30..abb60392c1 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -54,8 +54,8 @@ def __init__(self, torchair_graph_config): "graph_batch_sizes", []) self.graph_batch_sizes_init = torchair_graph_config.get( "graph_batch_sizes_init", False) - 
self.enable_multistream_shared_expert = torchair_graph_config.get( - "enable_multistream_shared_expert", False) + self.enable_multistream_moe = torchair_graph_config.get( + "enable_multistream_moe", False) self.enable_view_optimize = torchair_graph_config.get( "enable_view_optimize", True) diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py index c3de6ae932..9db49cbff6 100644 --- a/vllm_ascend/models/deepseek_dbo.py +++ b/vllm_ascend/models/deepseek_dbo.py @@ -29,7 +29,7 @@ import torch import torch.distributed as dist -import torch_npu +import torch_npu # noqa: F401 import vllm.envs as envs from torch import nn from transformers import PretrainedConfig @@ -40,13 +40,10 @@ get_tp_group, tensor_model_parallel_all_reduce) from vllm.distributed.parallel_state import get_dp_group from vllm.forward_context import get_forward_context -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, ReplicatedLinear, - RowParallelLinear, - UnquantizedLinearMethod) + RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope @@ -67,6 +64,7 @@ import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2MLP from vllm_ascend.multistream.base import MSEventKey from vllm_ascend.multistream.context import ( advance_step_multistream_layer_context, get_multistream_comm_context, @@ -78,117 +76,17 @@ make_multistream_metadata_ds) from vllm_ascend.multistream.ms_split import compute_split_seq_index from vllm_ascend.ops.fused_moe import AscendFusedMoE -from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod from vllm_ascend.utils import dispose_tensor VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 -class CustomDeepseekDBOMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - quant_config: Optional[QuantizationConfig] = None, - reduce_results: bool = True, - prefix: str = "", - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj") - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - # NOTE: `torch_npu.npu_dequant_swiglu_quant` can only be enabled in dynamic quant - self.is_dynamic_quant = not isinstance( - self.gate_up_proj.quant_method, - UnquantizedLinearMethod) and isinstance( - self.gate_up_proj.quant_method.quant_method, - AscendW8A8DynamicLinearMethod) - - def forward(self, x): - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - x = tensor_model_parallel_all_reduce(x) - return x - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x +class CustomDeepseekDBOMLP(CustomDeepseekV2MLP): def _forward_ms_mlp(self, x): current_ms_metadata = get_multistream_comm_context() assert current_ms_metadata is not None - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - current_ms_metadata.before_comm_event.record() - with torch.npu.stream(current_ms_metadata.comm_stream): - current_ms_metadata.before_comm_event.wait() - x = tensor_model_parallel_all_reduce(x) - current_ms_metadata.after_comm_event.record() - return x gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) current_ms_metadata.before_comm_event.record() diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 96c76338bb..a83ca47510 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -25,7 +25,7 @@ # # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py # """Inference-only DeepseekV2/DeepseekV3 model.""" -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -69,12 +69,73 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import AscendFusedMoE +from vllm_ascend.quantization.quant_config import AscendLinearMethod from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod from vllm_ascend.utils import dispose_tensor VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 +class CustomDeepseekV2SiluAndMul(SiluAndMul): + + def __init__(self, + *, + weight_scale: Optional[Callable[[], torch.Tensor]] = None): + 
super().__init__() + self.weight_scale = weight_scale + + def forward_oot(self, x: Union[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]): + if isinstance(x, tuple): + assert self.weight_scale is not None + # For AscendW8A8DynamicLinearMethod: + # a dynamic scale is passed along with the quantized value. + quantized_x, dynamic_scale = x + return torch_npu.npu_dequant_swiglu_quant( + x=quantized_x, + weight_scale=self.weight_scale(), + activation_scale=dynamic_scale, + activate_left=True, + quant_mode=1) + else: + return super().forward_oot(x) + + +class CustomDeepseekV2MergedReplicatedLinear(ReplicatedLinear): + + def __init__( + self, + input_size: int, + output_sizes: list[int], + bias: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + self.output_sizes = output_sizes + super().__init__(input_size, + sum(output_sizes), + bias=bias, + quant_config=quant_config, + prefix=prefix) + + def weight_loader(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, loaded_shard_id: int): + # With no support for GGUF format yet. + assert not getattr(param, "is_gguf_weight", False) + assert not getattr(param, "is_gguf_weight_type", False) + + assert loaded_shard_id < len(self.output_sizes) + shard_offset = sum(self.output_sizes[:loaded_shard_id]) + shard_size = self.output_sizes[loaded_shard_id] + shard = param.data.narrow(param.output_dim, shard_offset, shard_size) + + assert shard.size() == loaded_weight.size(), ( + f"Tried to load weights of size {loaded_weight.size()}" + f"to a parameter shard of id {loaded_shard_id} size {shard.size()}" + ) + shard.copy_(loaded_weight) + + class CustomDeepseekV2MLP(nn.Module): def __init__( @@ -84,61 +145,68 @@ def __init__( hidden_act: str, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True, + force_replicate: bool = False, prefix: str = "", ) -> None: super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj") + if not force_replicate: + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") + else: + self.gate_up_proj = CustomDeepseekV2MergedReplicatedLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = ReplicatedLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") - self.act_fn = SiluAndMul() - # NOTE: `torch_npu.npu_dequant_swiglu_quant` can only be enabled in dynamic quant - self.is_dynamic_quant = not isinstance( - self.gate_up_proj.quant_method, - UnquantizedLinearMethod) and isinstance( - self.gate_up_proj.quant_method.quant_method, - AscendW8A8DynamicLinearMethod) + quant_method = self.gate_up_proj.quant_method + if isinstance(quant_method, UnquantizedLinearMethod): + self.act_fn = CustomDeepseekV2SiluAndMul() + elif (isinstance(quant_method, AscendLinearMethod) and isinstance( + quant_method.quant_method, AscendW8A8DynamicLinearMethod)): + # TODO(sdmyzlp): Currently preserved as before: + # 1. The only quantization supported for silu is W8A8Dynamic + # 2. Output dtype of gate_up/down is fixed to be int32/bfloat16 + # + # Maybe one can implement a better and more general configuration + # scheme, e.g. by somehow passing around the tweaked `quant_config` + self.act_fn = CustomDeepseekV2SiluAndMul( + # Use lazy binding, for `weight_scale_fp32` is accessible + # only after `process_weights_after_loading`. + weight_scale=lambda: self.gate_up_proj.weight_scale_fp32) + # To be consumed by AscendW8A8DynamicLinearMethod.apply() + self.gate_up_proj._ascend_quant_config = { + "output_dtype": torch.int32, + "pertoken_scale": False, + "return_scale": True, + } + self.down_proj._ascend_quant_config = { + "output_dtype": torch.bfloat16, + "pertoken_scale": True, + "return_scale": False, + } + else: + raise NotImplementedError( + f"Quantization with [{type(quant_method)}] is NOT supported") def forward(self, x): - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - x = tensor_model_parallel_all_reduce(x) - return x gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) x, _ = self.down_proj(x) @@ -169,6 +237,12 @@ def __init__( raise ValueError(f"Unsupported activation: {config.hidden_act}. 
" "Only silu is supported for now.") + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on + self.enable_multistream_moe = \ + ascend_config.torchair_graph_config.enable_multistream_moe and VLLM_ENABLE_MC2 + self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, bias=False, @@ -204,8 +278,11 @@ def __init__( hidden_act=config.hidden_act, quant_config=quant_config, reduce_results=True, + force_replicate=self.enable_multistream_moe, prefix=f"{prefix}.shared_experts", ) + else: + self.shared_experts = None # type: ignore CustomDeepseekV2MoE.top_k = config.num_experts_per_tok self.dp_size = get_dp_group().world_size @@ -216,12 +293,6 @@ def __init__( self.params_dtype = torch.get_default_dtype() - ascend_config = get_ascend_config() - self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on - self.enable_multistream_shared_expert = \ - ascend_config.torchair_graph_config.enable_multistream_shared_expert and VLLM_ENABLE_MC2 - def forward( self, hidden_states: torch.Tensor, @@ -240,12 +311,10 @@ def forward( enable_force_load_balance = False if hasattr(attn_metadata, 'with_prefill_across_dp'): is_prefill = is_prefill or attn_metadata.with_prefill_across_dp - num_tokens, hidden_size = hidden_states.shape - - multistream = self.enable_multistream_shared_expert and not is_prefill - - old_hidden_states = hidden_states.clone() + old_hidden_states = hidden_states + use_separated_shared_experts = (self.shared_experts is not None + and not self.enable_multistream_moe) if self.tp_size > 1: if (VLLM_ENABLE_MC2 @@ -262,25 +331,22 @@ def forward( # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - kwargs = {} - if multistream: - kwargs.update({ - "shared_experts": self.shared_experts, - "shared_hidden_states": old_hidden_states - }) - - hidden_states = self.experts( + experts_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits, is_prefill=is_prefill, top_k=CustomDeepseekV2MoE.top_k, enable_force_load_balance=enable_force_load_balance, - **kwargs) - - if multistream: - hidden_states, shared_output = hidden_states + shared_experts=(self.shared_experts + if not use_separated_shared_experts else None), + ) - hidden_states = hidden_states * self.routed_scaling_factor + if not isinstance(experts_hidden_states, tuple): + hidden_states = experts_hidden_states * self.routed_scaling_factor + else: + hidden_states = ( + experts_hidden_states[0] * self.routed_scaling_factor + + experts_hidden_states[1]) if self.tp_size > 1: if (VLLM_ENABLE_MC2 @@ -294,12 +360,9 @@ def forward( else: hidden_states = tensor_model_parallel_all_reduce(hidden_states) - if self.n_shared_experts is not None: - if not multistream: - shared_output = self.shared_experts(old_hidden_states) - - if shared_output is not None: - hidden_states = hidden_states + shared_output + if use_separated_shared_experts: + hidden_states = hidden_states + self.shared_experts( + old_hidden_states) return hidden_states.view(num_tokens, hidden_size) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 25f3b05d59..d6115d35c1 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -16,7 +16,7 @@ # Adapted from vllm/tests/kernels/test_moe.py import os -from typing import Callable, List, Optional +from typing import Any, 
Callable, List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -36,6 +36,7 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer +from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 USING_LCCL_COM: bool = envs_ascend.USING_LCCL_COM @@ -106,15 +107,17 @@ def process_topk_ids(topk_ids: torch.Tensor, expert_num: int, ep_size: int, return topk_ids_pad, unpad_indices -def fused_experts_with_mc2(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - expert_map: torch.Tensor = None, - moe_all_to_all_group_name: Optional[str] = None, - **kwargs) -> torch.Tensor: +def fused_experts_with_mc2( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + moe_all_to_all_group_name: Optional[str] = None, + shared_experts: Optional[Any] = None +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: global_bs = 0 moe_expert_num = len(expert_map) kwargs_mc2 = { @@ -154,6 +157,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[ 0:5] + if shared_experts is not None: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(hidden_states, topk_weights) + shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states) + npu_wait_tensor(shared_gate_up, expand_x) + shared_act = shared_experts.act_fn(shared_gate_up) + w1 = w1.transpose(1, 2) group_list = expert_token_nums.to(torch.int64) @@ -210,7 +220,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2) - return hidden_states + if shared_experts is None: + return hidden_states + else: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(shared_act, down_out_list) + shared_hidden_states, _ = shared_experts.down_proj(shared_act) + return hidden_states, shared_hidden_states def apply_mlp(hidden_states_wrapper: List[torch.Tensor], @@ -875,6 +891,7 @@ def apply( e_score_correction_bias: Optional[torch.Tensor] = None, is_prefill: bool = False, enable_force_load_balance: bool = False, + shared_experts: Optional[Any] = None, **kwargs, ) -> torch.Tensor: @@ -924,7 +941,7 @@ def apply( top_k=top_k, expert_map=expert_map, moe_all_to_all_group_name=self.moe_all_to_all_group_name, - **kwargs) + shared_experts=shared_experts) elif self.torchair_graph_enabled or get_ep_group().world_size == 1: return fused_experts(hidden_states=x, w1=layer.w13_weight, @@ -1053,9 +1070,6 @@ def __init__( self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on - self.enable_multistream_shared_expert = \ - ascend_config.torchair_graph_config.enable_multistream_shared_expert and VLLM_ENABLE_MC2 if self.scoring_func != "softmax" and not self.use_grouped_topk: raise ValueError("Only softmax scoring function is supported for " @@ -1102,8 +1116,8 @@ def forward(self, router_logits: torch.Tensor, is_prefill: bool, enable_force_load_balance: bool = False, - top_k=None, - **kwargs): + top_k: Optional[int] = None, + shared_experts: 
Optional[Any] = None): assert self.quant_method is not None if top_k: @@ -1132,7 +1146,7 @@ def forward(self, hidden_states, router_logits) # Matrix multiply. - hidden_states = self.quant_method.apply( + e_hidden_states = self.quant_method.apply( layer=self, x=hidden_states, router_logits=router_logits, @@ -1150,36 +1164,39 @@ def forward(self, enable_force_load_balance=enable_force_load_balance, log2phy=self.log2phy, global_redundant_expert_num=self.global_redundant_expert_num, - **kwargs) + shared_experts=shared_experts, + ) - if self.enable_multistream_shared_expert and not is_prefill: - hidden_states, shared_output = hidden_states + if shared_experts is not None: + # Provide dummy implementation of "non-separated" shared experts. + if not isinstance(e_hidden_states, tuple): + return e_hidden_states, shared_experts(hidden_states) + else: + return e_hidden_states if self.dp_size > 1: if VLLM_ENABLE_MC2 and not is_prefill: ... elif self.torchair_graph_enabled: if USING_LCCL_COM: # type: ignore - hidden_states = dist._functional_collectives.reduce_scatter_tensor( - hidden_states, + e_hidden_states = dist._functional_collectives.reduce_scatter_tensor( + e_hidden_states, "sum", scatter_dim=0, group=get_dp_group().device_group) elif self.torchair_graph_enabled and not is_prefill: - hidden_states = dist._functional_collectives.reduce_scatter_tensor( - hidden_states, + e_hidden_states = dist._functional_collectives.reduce_scatter_tensor( + e_hidden_states, "sum", scatter_dim=0, group=get_dp_group().device_group) else: - hidden_states = get_ep_group().combine(hidden_states) + e_hidden_states = get_ep_group().combine(e_hidden_states) if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): - hidden_states = tensor_model_parallel_all_reduce(hidden_states) + e_hidden_states = tensor_model_parallel_all_reduce(e_hidden_states) - if self.enable_multistream_shared_expert and not is_prefill: - return hidden_states, shared_output - return hidden_states + return e_hidden_states # ----------------------------------------- TBO-related -------------------------------------------- diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index c6e863ff71..66a0a302c8 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -15,19 +15,19 @@ # limitations under the License. 
# -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, Optional, Tuple, Union import torch import torch.distributed as dist import torch_npu -import torchair as tng # type: ignore -from vllm.distributed import GroupCoordinator, tensor_model_parallel_all_reduce +from vllm.distributed import GroupCoordinator import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import select_experts -from vllm_ascend.utils import dispose_tensor +from vllm_ascend.utils import (dispose_tensor, npu_stream_switch, + npu_wait_tensor) VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 @@ -39,8 +39,7 @@ def apply_mlp(hidden_states: torch.Tensor, w2_scale: torch.Tensor, group_list: torch.Tensor, dynamic_scale: torch.Tensor = None, - group_list_type: int = 1, - **kwargs) -> torch.Tensor: + group_list_type: int = 1) -> torch.Tensor: """ apply MLP: gate_up_proj -> swiglu -> down_proj @@ -74,23 +73,6 @@ def apply_mlp(hidden_states: torch.Tensor, else: pertoken_scale = dynamic_scale - shared_experts = kwargs.get('shared_experts', None) - if shared_experts: - shared_gate_up = kwargs.get('shared_gate_up', None) - shared_dynamic_scale = kwargs.get('shared_dynamic_scale', None) - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_gate_up, hidden_states) - shared_x, shared_dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=shared_gate_up, - weight_scale=shared_experts.gate_up_proj.weight_scale_fp32, - activation_scale=shared_dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - # gmm1: gate_up_proj hidden_states = torch_npu.npu_grouped_matmul( x=[hidden_states], @@ -120,36 +102,24 @@ def apply_mlp(hidden_states: torch.Tensor, group_list=group_list, output_dtype=w2_scale.dtype)[0] - if shared_experts: - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_x, hidden_states) - shared_output = torch_npu.npu_quant_matmul( - shared_x, - shared_experts.down_proj.weight, - shared_experts.down_proj.weight_scale, - pertoken_scale=shared_dynamic_scale, - output_dtype=torch.bfloat16, - ) - if shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1: - shared_output = tensor_model_parallel_all_reduce(shared_output) - if shared_experts: - return hidden_states, shared_output return hidden_states -def fused_experts_with_mc2(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - expert_map: torch.Tensor = None, - moe_all_to_all_group_name: str = "", - log2phy: torch.Tensor = None, - global_redundant_expert_num: int = 0, - **kwargs) -> torch.Tensor: +def fused_experts_with_mc2( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + moe_all_to_all_group_name: str = "", + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[Any] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if log2phy: topk_ids = log2phy[topk_ids] global_bs = 0 @@ -188,31 +158,17 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, } kwargs_mc2.update(stage1_kwargs) - 
shared_experts = kwargs.get('shared_experts', None) - if shared_experts: - shared_hidden_states = kwargs.get('shared_hidden_states', None) - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_hidden_states, hidden_states) - shared_x, shared_dynamic_scale = torch_npu.npu_dynamic_quant( - shared_hidden_states) - shared_gate_up = torch_npu.npu_quant_matmul( - shared_x, - shared_experts.gate_up_proj.weight, - shared_experts.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - kwargs.update({ - "shared_gate_up": shared_gate_up, - "shared_dynamic_scale": shared_dynamic_scale, - }) - output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2) # comm_stream.wait_stream(torch.npu.current_stream()) expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[ 0:5] - if quant_mode == 0: - dynamic_scale = None + if shared_experts is not None: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(hidden_states, topk_weights) + shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states) + npu_wait_tensor(shared_gate_up[0], expand_x) + shared_act = shared_experts.act_fn(shared_gate_up) # `expand_x` will be disposed in the `apply_mlp` function down_out_list = apply_mlp(expand_x, @@ -221,12 +177,7 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, w2, w2_scale, expert_token_nums, - dynamic_scale=dynamic_scale, - **kwargs) - - multi_stream = isinstance(down_out_list, tuple) - if multi_stream: - down_out_list, shared_output = down_out_list + dynamic_scale=dynamic_scale) # moeCombine kwargs_mc2 = { @@ -257,9 +208,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2) - if multi_stream: + if shared_experts is None: + return hidden_states + else: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(shared_act[0], down_out_list) + shared_output, _ = shared_experts.down_proj(shared_act) return hidden_states, shared_output - return hidden_states # currently expert parallelism implemented with all2all @@ -541,21 +496,33 @@ def get_perchannel_param( @staticmethod def apply( layer: torch.nn.Module, - x: torch.Tensor, + x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], bias: Optional[torch.Tensor] = None, tp_rank: Optional[int] = 0, ) -> torch.Tensor: - original_dtype = x.dtype - # use ATB quantize - quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x) - return torch_npu.npu_quant_matmul( - quant_out, + config = getattr(layer, "_ascend_quant_config", {}) + if not isinstance(x, tuple): + output_dtype = config.get("output_dtype", x.dtype) + quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x) + else: + assert "output_dtype" in config.keys(), ( + f"DynamicLinearMethod needs explicitly specified `output_dtype`" + f"for pre-quantized input, got config [{config}]") + output_dtype = config["output_dtype"] + quantized_x, dynamic_scale = x + pertoken_scale = (dynamic_scale + if config.get("pertoken_scale", True) else None) + + output = torch_npu.npu_quant_matmul( + quantized_x, layer.weight, layer.weight_scale, - pertoken_scale=dynamic_scale, + pertoken_scale=pertoken_scale, bias=bias, - output_dtype=original_dtype, + output_dtype=output_dtype, ) + return ((output, dynamic_scale) + if config.get("return_scale", False) else output) def process_weights_after_loading(self, layer): if self.transpose_weight: @@ -650,6 +617,7 @@ def apply( enable_force_load_balance: bool = True, log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, + 
shared_experts: Optional[Any] = None, **kwargs, ) -> torch.Tensor: assert router_logits.shape[ @@ -706,7 +674,7 @@ def apply( moe_all_to_all_group_name=self.moe_all_to_all_group_name, log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, - **kwargs) + shared_experts=shared_experts) elif self.torchair_graph_enabled or self.ep_group.world_size == 1: return fused_experts(hidden_states=x, w1=layer.w13_weight, diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 7d40938049..f41dab4b94 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -19,17 +19,26 @@ import atexit import math -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext from threading import Lock from typing import TYPE_CHECKING, List, Tuple import torch +import torchair # type: ignore[import] # noqa: F401 from packaging.version import InvalidVersion, Version from torch_npu.npu.streams import Event from vllm.logger import logger import vllm_ascend.envs as envs +try: + # Recent release of torchair has moved these ops to `.scope`. + from torchair.scope import npu_stream_switch as _npu_stream_switch + from torchair.scope import npu_wait_tensor as _npu_wait_tensor +except ImportError: + from torchair.ops import NpuStreamSwitch as _npu_stream_switch + from torchair.ops import npu_wait_tensor as _npu_wait_tensor + if TYPE_CHECKING: from vllm.config import VllmConfig else: @@ -227,3 +236,14 @@ def pop_captured_sync(self) -> dict: durations[tag] = observe_start.elapsed_time(observe_end) return durations + + +def npu_stream_switch(tag: str, priority: int, *, enabled: bool = True): + return _npu_stream_switch(tag, priority) if enabled else nullcontext() + + +def npu_wait_tensor(self: torch.Tensor, + dependency: torch.Tensor, + *, + enabled: bool = True): + return _npu_wait_tensor(self, dependency) if enabled else self From 860a5ef7fd1930b18f28ac48d803ef7f6298e2e6 Mon Sep 17 00:00:00 2001 From: depeng1994 <166494784+depeng1994@users.noreply.github.com> Date: Wed, 11 Jun 2025 10:02:11 +0800 Subject: [PATCH 019/153] provide an e2e guide for execute duration profiling (#1113) ### What this PR does / why we need it? provide an e2e guide for execute duration profiling Signed-off-by: depeng1994 --- .../developer_guide/evaluation/profile_execute_duration.md | 5 +++++ vllm_ascend/worker/model_runner_v1.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/source/developer_guide/evaluation/profile_execute_duration.md b/docs/source/developer_guide/evaluation/profile_execute_duration.md index 8989bf9cf9..3b37df6beb 100644 --- a/docs/source/developer_guide/evaluation/profile_execute_duration.md +++ b/docs/source/developer_guide/evaluation/profile_execute_duration.md @@ -9,6 +9,11 @@ The execution duration of each stage (including pre/post-processing, model forwa * Use the non-blocking API `ProfileExecuteDuration().capture_async` to set observation points asynchronously when you need to observe the execution duration. * Use the blocking API `ProfileExecuteDuration().pop_captured_sync` at an appropriate time to get and print the execution durations of all observed stages. +**We have instrumented the key inference stages (including pre-processing, model forward pass, etc.) for execute duration profiling. 
Execute the script as follows:** +``` +VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE=1 python3 vllm-ascend/examples/offline_inference_npu.py +``` + ## Example Output ``` diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 51ffe1ec2d..9b0a860baa 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1317,8 +1317,8 @@ def execute_model( for tag, duration in durations.items() ] captured_name = "Decode" if self.attn_state == AscendAttentionState.DecodeOnly else "Prefill" - print(f"Profile execute duration [{captured_name}]:", - " ".join(dr_str)) + logger.info("Profile execute duration [%s]:%s", captured_name, + " ".join(dr_str)) return model_runner_output From 980cd814660791b1af7ab8b7e5d1c548359224e7 Mon Sep 17 00:00:00 2001 From: ttanzhiqiang <38750855+ttanzhiqiang@users.noreply.github.com> Date: Wed, 11 Jun 2025 10:40:50 +0800 Subject: [PATCH 020/153] etp best a2 (#1101) ### What this PR does / why we need it? Single machine 16 cards deepseekr1 attention (tp8/dp2) / moe(etp) Best performance rely on: vllm-ascend commit id:da9acfca6053352730fce75fb772e214755d0341 vllm commit id:b124e1085b1bf977e3dac96d99ffd9d8ddfdb6cc + https://github.com/vllm-project/vllm-ascend/pull/910 + [Reduce _npu_flash_attention mask to 128x128 for memory savings] https://github.com/vllm-project/vllm-ascend/pull/1100+ [Reduce memory usage by splitting tokens in fused_experts] --------- Signed-off-by: ttanzhiqiang <389825161@qq.com> --- examples/run_dp_attention_etp16.sh | 23 +++++++++ examples/run_dp_attention_etp16_benmark.sh | 56 ++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 examples/run_dp_attention_etp16.sh create mode 100644 examples/run_dp_attention_etp16_benmark.sh diff --git a/examples/run_dp_attention_etp16.sh b/examples/run_dp_attention_etp16.sh new file mode 100644 index 0000000000..b736492411 --- /dev/null +++ b/examples/run_dp_attention_etp16.sh @@ -0,0 +1,23 @@ +export VLLM_ENABLE_MC2=0 +export VLLM_USE_V1=1 +export TASK_QUEUE_ENABLE=1 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +source /usr/local/Ascend/nnal/atb/set_env.sh +export ASCEND_LAUNCH_BLOCKING=0 +export VLLM_VERSION=0.9.0 + +nohup python -m vllm.entrypoints.openai.api_server --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM \ + --quantization ascend \ + --trust-remote-code \ + --distributed-executor-backend=mp \ + --port 8006 \ + -tp=8 \ + -dp=2 \ + --max-num-seqs 24 \ + --max-model-len 32768 \ + --max-num-batched-tokens 32768 \ + --block-size 128 \ + --no-enable-prefix-caching \ + --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true},"expert_tensor_parallel_size":16}' \ + --gpu-memory-utilization 0.96 &> run.log & +disown \ No newline at end of file diff --git a/examples/run_dp_attention_etp16_benmark.sh b/examples/run_dp_attention_etp16_benmark.sh new file mode 100644 index 0000000000..ab72b3b28d --- /dev/null +++ b/examples/run_dp_attention_etp16_benmark.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Concurrency array +concurrency_array=(48) +#best rate +rate_array=(0.7) + +# Result file +result_file="benchmark_results.txt" +echo "Benchmark Results" > $result_file +echo "===================" >> $result_file + +# Loop through all combinations +for concurrency in "${concurrency_array[@]}"; do + for rate in "${rate_array[@]}"; do + echo "Testing with concurrency=$concurrency, rate=$rate" + echo "" >> $result_file + echo "Concurrency: 
$concurrency, Request Rate: $rate" >> $result_file + echo "-------------------" >> $result_file + + # Run benchmark test + python /mnt/deepseek/vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --trust-remote-code \ + --model /mnt/deepseek/DeepSeek-R1-W8A8-VLLM \ + --dataset-name random \ + --random-input-len 4096 \ + --random-output-len 1536 \ + --ignore-eos \ + --num-prompts 400 \ + --max-concurrency $concurrency \ + --request-rate $rate \ + --metric-percentiles 90 \ + --base-url http://localhost:8006 2>&1 | tee -a $result_file + + # Wait for system cool down + sleep 30 + done +done + +# Analyze results +echo "Analysis Results" > analysis_results.txt +echo "=================" >> analysis_results.txt + +# Extract and analyze TPOT data +echo "TPOT Analysis:" >> analysis_results.txt +grep "Mean TPOT" $result_file | awk -F':' '{ + printf "Concurrency %s, Rate %s: %s ms\n", $1, $2, $3 +}' >> analysis_results.txt + +# Extract and analyze throughput data +echo -e "\nThroughput Analysis:" >> analysis_results.txt +grep "Output token throughput" $result_file | awk -F':' '{ + printf "Concurrency %s, Rate %s: %s tokens/s\n", $1, $2, $3 +}' >> analysis_results.txt + +echo "Testing completed. Results saved in $result_file and analysis in analysis_results.txt" From 4153a5091b698c2270d160409e7fee73baaf701b Mon Sep 17 00:00:00 2001 From: yz <43207690+yzim@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:03:37 +0800 Subject: [PATCH 021/153] [Doc] Fix the config parameter name "enable" in graph_mode.md. (#1159) Fix the doc typo in graph_mode.md Signed-off-by: yzim <43207690+yzim@users.noreply.github.com> --- docs/source/user_guide/graph_mode.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md index 126978f95b..6831809f22 100644 --- a/docs/source/user_guide/graph_mode.md +++ b/docs/source/user_guide/graph_mode.md @@ -47,14 +47,14 @@ from vllm import LLM os.environ["VLLM_USE_V1"] = 1 -model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enable": True}}) +model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True}}) outputs = model.generate("Hello, how are you?") ``` online example: ```shell -vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enable": true}}' +vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enabled": true}}' ``` You can find more detail about additional config [here](./additional_config.md) From e46dc142bf1180453c64226d76854fc1ec696169 Mon Sep 17 00:00:00 2001 From: chenwaner <48718746+chenwaner@users.noreply.github.com> Date: Wed, 11 Jun 2025 14:09:28 +0800 Subject: [PATCH 022/153] Enable kvcache_nz for the decode process in torchair graph mode (#1098) What this PR does / why we need it? Enable kvcache_nz for the decode process in torchair graph mode, which reduces the time consumed by FA in long sequences. Does this PR introduce any user-facing change? If need to enable kvcache_nz, should set the additional_config.torchair_graph_config.enable_kv_nz=True How was this patch tested? 1. Tested in deepseek model: with batchsize 64 and seq_len 1k+3k, 61 layers FA total time improves 20.80ms -> 19.76ms 2. operator precision test: [aclnnFusedInferAttentionScoreV3_result.csv](https://github.com/user-attachments/files/20664138/aclnnFusedInferAttentionScoreV3_result.csv) 3. 
tpot test from @ttanzhiqiang, and curl one result is normal https://github.com/vllm-project/vllm-ascend/pull/1098#issuecomment-2948542159 https://github.com/vllm-project/vllm-ascend/pull/1098#issuecomment-2954496588 --------- Signed-off-by: chenwaner <861645847@qq.com> --- docs/source/user_guide/additional_config.md | 4 +- vllm_ascend/ascend_config.py | 1 + vllm_ascend/attention/mla_v1.py | 138 +++++++++++++------- 3 files changed, 96 insertions(+), 47 deletions(-) diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index b769a31ab9..90002dbb91 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -44,6 +44,7 @@ The details of each config option are as follows: | `use_cached_graph` | bool | `False` | Whether to use cached graph | | `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache | | `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty | +| `enable_kv_nz`| bool | `False` | Whether to enable kvcache NZ layout | **ascend_scheduler_config** @@ -64,7 +65,8 @@ A full example of additional configuration is as follows: "use_cached_graph": true, "graph_batch_sizes": [1, 2, 4, 8], "graph_batch_sizes_init": false, - "enable_multistream_moe": false + "enable_multistream_moe": false, + "enable_kv_nz": false }, "ascend_scheduler_config": { "enabled": true, diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index abb60392c1..f7441f5f82 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -58,6 +58,7 @@ def __init__(self, torchair_graph_config): "enable_multistream_moe", False) self.enable_view_optimize = torchair_graph_config.get( "enable_view_optimize", True) + self.enable_kv_nz = torchair_graph_config.get("enable_kv_nz", False) if not isinstance(self.graph_batch_sizes, list): raise TypeError("graph_batch_sizes must be list[int]") diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 7f603422c8..9e10815fbe 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -480,6 +480,7 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz # Adapt torch air graph mode with spec decoding. 
speculative_config = get_current_vllm_config().speculative_config if speculative_config is not None: @@ -662,6 +663,7 @@ def exec_kv( kv = self.kv_a_proj_with_mqa(hidden_states)[0] # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) + cache_mode = "PA_NZ" if self.enable_kv_nz else "PA" k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( kv, self.kv_a_layernorm.weight, @@ -671,7 +673,37 @@ def exec_kv( kv_cache[1], kv_cache[0], epsilon=self.kv_a_layernorm.variance_epsilon, - cache_mode="PA", + cache_mode=cache_mode, + ) + return k_pe, k_nope + + def exec_kv_prefill( + self, + hidden_states: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + kv_cache: Tuple, + slots: torch.Tensor, + ): + + B = hidden_states.shape[0] + N = self.num_kv_heads + S = 1 + kv = self.kv_a_proj_with_mqa(hidden_states)[0] + # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] + kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) + cache_mode = "PA_BLK_NZ" if self.enable_kv_nz else "PA" + _, _, k_pe, k_nope = torch_npu.npu_kv_rmsnorm_rope_cache( + kv, + self.kv_a_layernorm.weight, + cos, + sin, + slots.to(torch.int64), + kv_cache[1], + kv_cache[0], + epsilon=self.kv_a_layernorm.variance_epsilon, + cache_mode=cache_mode, + is_output_kv=True, ) return k_pe, k_nope @@ -709,34 +741,42 @@ def _forward_decode( # TorchAir's shape is [bs, num_heads_per_rank, q_seq_len, dim] if attn_metadata.attn_state == AscendAttentionState.SpecDecoding: assert num_tokens % self.spec_token_num == 0 - q_nope = (q_nope.view( - num_tokens // (self.spec_token_num + 1), - self.spec_token_num + 1, - self.num_heads, - -1, - ).transpose(1, 2).contiguous()) - q_pe = (q_pe.view( - num_tokens // (self.spec_token_num + 1), - self.spec_token_num + 1, - self.num_heads, - -1, - ).transpose(1, 2).contiguous()) + q_nope = q_nope.view(num_tokens // (self.spec_token_num + 1), + self.spec_token_num + 1, self.num_heads, + -1) + q_pe = q_pe.view(num_tokens // (self.spec_token_num + 1), + self.spec_token_num + 1, self.num_heads, -1) + if not self.enable_kv_nz: + q_nope = q_nope.transpose(1, 2).contiguous() + q_pe = q_pe.transpose(1, 2).contiguous() sparse_mode = 3 spec_attn_mask = attn_metadata.decode.attn_mask # type:ignore else: - q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) - q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) + if self.enable_kv_nz: + q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1) + q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1) + else: + q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) + q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) sparse_mode = 0 spec_attn_mask = None # shape of knope/k_pe for npu graph mode should be: # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim] block_size = kv_c_and_k_pe_cache[0].shape[1] - k_nope = k_nope.view(-1, self.num_kv_heads, block_size, - self.kv_lora_rank) - k_pe = k_pe.view(-1, self.num_kv_heads, block_size, - self.qk_rope_head_dim) + if self.enable_kv_nz: + k_nope = k_nope.view(-1, self.num_kv_heads, + self.kv_lora_rank // 16, block_size, 16) + k_pe = k_pe.view(-1, self.num_kv_heads, + self.qk_rope_head_dim // 16, block_size, 16) + input_layout = "BSND" + else: + k_nope = k_nope.view(-1, self.num_kv_heads, block_size, + self.kv_lora_rank) + k_pe = k_pe.view(-1, self.num_kv_heads, block_size, + self.qk_rope_head_dim) + input_layout = "BNSD" - attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score( + attn_output, _ = 
torch_npu.npu_fused_infer_attention_score( q_nope, k_nope, k_nope, @@ -744,7 +784,7 @@ def _forward_decode( key_rope=k_pe, num_heads=self.num_heads, num_key_value_heads=self.num_kv_heads, - input_layout="BNSD", + input_layout=input_layout, atten_mask=spec_attn_mask, sparse_mode=sparse_mode, scale=self.scale, @@ -793,10 +833,11 @@ def forward( ] num_actual_toks = attn_metadata.num_actual_tokens if k_pe is None and not self.running_in_graph: - kv_c, k_pe = self.kv_a_proj_with_mqa( - hidden_states_or_kv_c_normed)[0].split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) + if not self.torchair_graph_enabled: + kv_c, k_pe = self.kv_a_proj_with_mqa( + hidden_states_or_kv_c_normed)[0].split( + [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) else: kv_c_normed = hidden_states_or_kv_c_normed assert attn_metadata.num_decodes is not None and \ @@ -809,16 +850,18 @@ def forward( # Inputs and outputs may be padded for CUDA graphs output_padded = output output = output[:num_actual_toks, ...] - kv_c_normed = kv_c_normed[:num_actual_toks, ...] - prefill_k_c_normed = kv_c_normed[num_decode_tokens:] + if not self.torchair_graph_enabled: + kv_c_normed = kv_c_normed[:num_actual_toks, ...] + prefill_k_c_normed = kv_c_normed[num_decode_tokens:] if not self.running_in_graph: hidden_states_or_q_c = hidden_states_or_q_c[:num_actual_toks, ...] - decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] prefill_hs_or_q_c = hidden_states_or_q_c[num_decode_tokens:] - k_pe = k_pe[:num_actual_toks, ...] - k_pe = k_pe.unsqueeze(1) - decode_k_pe = k_pe[:num_decode_tokens] - prefill_k_pe = k_pe[num_decode_tokens:] + if not self.torchair_graph_enabled: + decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] + k_pe = k_pe[:num_actual_toks, ...] + k_pe = k_pe.unsqueeze(1) + decode_k_pe = k_pe[:num_decode_tokens] + prefill_k_pe = k_pe[num_decode_tokens:] else: decode_hs_or_q_c = hidden_states_or_q_c if has_decode: @@ -855,22 +898,25 @@ def forward( prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim] if self.torchair_graph_enabled: num_tokens = prefill_hs_or_q_c.shape[0] + seq_len = self.rotary_emb.max_position_embeddings + cos = self.rotary_emb.cos_cached[:seq_len].to( + dtype=prefill_q_pe.dtype) + sin = self.rotary_emb.sin_cached[:seq_len].to( + dtype=prefill_q_pe.dtype) + cos = cos[attn_metadata.prefill.input_positions] + sin = sin[attn_metadata.prefill.input_positions] + cos = cos[:, None, None, :] + sin = sin[:, None, None, :] + + prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin) + prefill_k_pe, prefill_k_nope = self.exec_kv_prefill( + hidden_states_or_kv_c_normed, cos, sin, kv_cache, + attn_metadata.slot_mapping) + + kv_c_normed = prefill_k_nope[:num_actual_toks, ...] 
+ prefill_k_c_normed = prefill_k_nope[num_decode_tokens:] prefill_k_pe = prefill_k_pe.view(num_tokens, self.num_kv_heads, -1) - if self.rotary_emb.__class__.__name__ == 'RotaryEmbedding': - # NOTE: When scaling not specified - ori_q_pe_shape, ori_k_pe_shape = prefill_q_pe.shape, prefill_k_pe.shape - prefill_q_pe = prefill_q_pe.reshape(num_tokens, -1) - prefill_k_pe = prefill_k_pe.reshape(num_tokens, -1) - prefill_q_pe, prefill_k_pe = self.rotary_emb( - attn_metadata.prefill.input_positions, prefill_q_pe, - prefill_k_pe) - prefill_q_pe = prefill_q_pe.view(ori_q_pe_shape) - prefill_k_pe = prefill_k_pe.view(ori_k_pe_shape) - else: - prefill_q_pe, prefill_k_pe = self.rotary_emb( - attn_metadata.prefill.input_positions, prefill_q_pe, - prefill_k_pe) prefill_q = torch.cat([prefill_q_nope, prefill_q_pe], dim=-1) else: prefill_q_pe[...], prefill_k_pe[...] = self.rotary_emb( From 4f5964420e271b2ed7e6cd89cbad2bb66b169114 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Wed, 11 Jun 2025 16:33:11 +0800 Subject: [PATCH 023/153] [CI] Upgrade vllm to 0.9.1 (#1165) 1. upgrade vllm to 0.9.1. 0.9.0 is not supported for main branch now. keep doc to 0.9.0 until we release the first 0.9.1 release. 2. disable V0 test for PR 3. move actionlint check to lint job Signed-off-by: wangxiyuan --- .github/workflows/accuracy_test.yaml | 5 +- .github/workflows/actionlint.yml | 53 -------- .github/workflows/nightly_benchmarks.yaml | 2 +- .github/workflows/vllm_ascend_test.yaml | 13 +- .../workflows/vllm_ascend_test_long_term.yaml | 2 +- .github/workflows/vllm_ascend_test_pd.yaml | 2 +- Dockerfile | 2 +- Dockerfile.openEuler | 2 +- tests/singlecard/compile/test_simple.py | 32 ++--- tests/singlecard/test_scheduler.py | 61 +++------ vllm_ascend/compilation/piecewise_backend.py | 8 +- vllm_ascend/core/scheduler.py | 19 +-- vllm_ascend/patch/__init__.py | 15 +-- vllm_ascend/patch/platform/__init__.py | 4 +- .../platform/patch_0_9_0/patch_distributed.py | 116 ------------------ .../patch_0_9_1}/__init__.py | 0 vllm_ascend/patch/worker/__init__.py | 4 +- .../patch_0_9_1}/__init__.py | 1 - vllm_ascend/worker/model_runner_v1.py | 51 +++----- 19 files changed, 72 insertions(+), 320 deletions(-) delete mode 100644 .github/workflows/actionlint.yml delete mode 100644 vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py rename vllm_ascend/patch/{worker/patch_0_9_0 => platform/patch_0_9_1}/__init__.py (100%) rename vllm_ascend/patch/{platform/patch_0_9_0 => worker/patch_0_9_1}/__init__.py (90%) diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index d1a61230ef..999fb6ad5d 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -34,8 +34,7 @@ on: # Current supported vLLM versions options: - main - - v0.9.0.1 - - v0.9.0 + - v0.9.1 - v0.7.3 vllm-ascend-version: description: 'vllm-ascend version:' @@ -159,7 +158,7 @@ jobs: repository: vllm-project/vllm path: ./vllm-empty # Please also update this when bump matched version - ref: ${{ github.event.inputs.vllm-version || 'v0.9.0' }} + ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }} - name: Install vllm-project/vllm from source working-directory: ./vllm-empty diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml deleted file mode 100644 index 91cd9c4120..0000000000 --- a/.github/workflows/actionlint.yml +++ /dev/null @@ -1,53 +0,0 @@ -# -# Copyright 2023 The vLLM team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: Lint GitHub Actions workflows -on: - pull_request: - branches: - - 'main' - - '*-dev' - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run actionlint" - env: - SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086 - run: | - echo "::add-matcher::.github/workflows/matchers/actionlint.json" - tools/actionlint.sh -color diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index da4dbccd92..6ee1b45ed8 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -50,7 +50,7 @@ jobs: strategy: matrix: include: - - vllm_branch: v0.9.0 + - vllm_branch: v0.9.1 vllm_ascend_branch: main container: image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index b02350234c..073058d28f 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -33,6 +33,9 @@ on: - '!benchmarks/**' - 'tools/mypy.sh' - 'mypy.ini' + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. 
@@ -87,6 +90,13 @@ jobs: repository: vllm-project/vllm path: vllm-empty + - name: Actionlint Check + env: + SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086 + run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" + tools/actionlint.sh -color + - name: Install vllm-project/vllm from source working-directory: vllm-empty run: | @@ -105,7 +115,7 @@ jobs: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [main, v0.9.0] + vllm_version: [main, v0.9.1] concurrency: group: > ${{ @@ -193,6 +203,7 @@ jobs: fi - name: Run vllm-project/vllm-ascend test on V0 engine + if: ${{ github.event_name == 'schedule' }} env: VLLM_USE_V1: 0 run: | diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index 2cc89172fc..c17200a8d1 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -43,7 +43,7 @@ jobs: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [main, v0.9.0] + vllm_version: [main, v0.9.1] name: vLLM Ascend long term test runs-on: ${{ matrix.os }} container: diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml index 7548b079e6..c2c76c9f23 100644 --- a/.github/workflows/vllm_ascend_test_pd.yaml +++ b/.github/workflows/vllm_ascend_test_pd.yaml @@ -41,7 +41,7 @@ jobs: if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }} strategy: matrix: - vllm_verison: [main, v0.9.0] + vllm_verison: [main, v0.9.1] name: vLLM Ascend prefilling decoding disaggregation test runs-on: linux-arm64-npu-static-8 diff --git a/Dockerfile b/Dockerfile index 1dfd10c861..952e77fe92 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.0 +ARG VLLM_TAG=v0.9.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index ffd1174d22..2ff3d0b39a 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.0 +ARG VLLM_TAG=v0.9.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
diff --git a/tests/singlecard/compile/test_simple.py b/tests/singlecard/compile/test_simple.py index 64d4cba672..70b89297ab 100644 --- a/tests/singlecard/compile/test_simple.py +++ b/tests/singlecard/compile/test_simple.py @@ -14,8 +14,6 @@ set_current_vllm_config) from vllm.utils import direct_register_custom_op -from vllm_ascend.utils import vllm_version_is - global_counter = 0 # create a library to hold the custom op @@ -93,28 +91,14 @@ def test_simple_piecewise_compile(): model = SillyModel(vllm_config=vllm_config, prefix="") inputs = torch.randn(100).npu() - - if vllm_version_is("0.9.0"): - kwargs = { - "num_graphs_seen": 1, # one graph for the model - "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 - "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers - "num_backend_compilations": - 3, # num_piecewise_capturable_graphs_seen - "num_cudagraph_caputured": - 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - } - else: - kwargs = { - "num_graphs_seen": 1, # one graph for the model - "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 - "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers - "num_backend_compilations": - 3, # num_piecewise_capturable_graphs_seen - "num_cudagraph_captured": - 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - } - + kwargs = { + "num_graphs_seen": 1, # one graph for the model + "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 + "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers + "num_backend_compilations": 3, # num_piecewise_capturable_graphs_seen + "num_cudagraph_captured": + 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + } with compilation_counter.expect(kwargs): model(inputs) diff --git a/tests/singlecard/test_scheduler.py b/tests/singlecard/test_scheduler.py index d1c6062783..8021f0306c 100644 --- a/tests/singlecard/test_scheduler.py +++ b/tests/singlecard/test_scheduler.py @@ -31,7 +31,6 @@ from vllm.v1.structured_output import StructuredOutputManager from vllm_ascend.core.scheduler import AscendScheduler -from vllm_ascend.utils import vllm_version_is EOS_TOKEN_ID = 50256 @@ -87,27 +86,15 @@ def create_scheduler( vllm_config = VllmConfig(scheduler_config=scheduler_config, model_config=model_config, cache_config=cache_config) - - if vllm_version_is("0.9.0"): - kv_cache_config = KVCacheConfig( - num_blocks=10000, # A large number of blocks to hold all requests - tensors={}, - kv_cache_groups=[ - KVCacheGroupSpec(['layer'], - FullAttentionSpec(16, 1, 1, torch.float32, - False)) - ], - ) - else: - kv_cache_config = KVCacheConfig( - num_blocks=10000, # A large number of blocks to hold all requests - kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])], - kv_cache_groups=[ - KVCacheGroupSpec(['layer'], - FullAttentionSpec(16, 1, 1, torch.float32, - False, None)) - ], - ) + kv_cache_config = KVCacheConfig( + num_blocks=10000, # A large number of blocks to hold all requests + kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])], + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(16, 1, 1, torch.float32, False, + None)) + ], + ) cache_config.num_gpu_blocks = 10000 return AscendScheduler( vllm_config, @@ -135,27 +122,15 @@ def create_requests(num_requests: int, else: mm_position = None mm_inputs = None - if vllm_version_is("0.9.0"): - request = Request( - request_id=f"{i}", - prompt_token_ids=[i] * num_tokens, - sampling_params=sampling_params, - multi_modal_inputs=mm_inputs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=None, - 
arrival_time=0, - eos_token_id=EOS_TOKEN_ID, - ) - else: - request = Request( - request_id=f"{i}", - prompt_token_ids=[i] * num_tokens, - sampling_params=sampling_params, - multi_modal_inputs=mm_inputs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=None, - eos_token_id=EOS_TOKEN_ID, - ) + request = Request( + request_id=f"{i}", + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_inputs=mm_inputs, + multi_modal_placeholders=mm_position, + multi_modal_hashes=None, + eos_token_id=EOS_TOKEN_ID, + ) requests.append(request) return requests diff --git a/vllm_ascend/compilation/piecewise_backend.py b/vllm_ascend/compilation/piecewise_backend.py index 95ce693933..c6a800b3d8 100644 --- a/vllm_ascend/compilation/piecewise_backend.py +++ b/vllm_ascend/compilation/piecewise_backend.py @@ -31,8 +31,6 @@ from vllm.logger import logger from vllm.utils import weak_ref_tensors -from vllm_ascend.utils import vllm_version_is - @dataclasses.dataclass class ConcreteSizeEntry: @@ -206,11 +204,7 @@ def __call__(self, *args) -> Any: # to save memory entry.output = weak_ref_tensors(output) entry.aclgraph = aclgraph - - if vllm_version_is("0.9.0"): - compilation_counter.num_cudagraph_caputured += 1 - else: - compilation_counter.num_cudagraph_captured += 1 + compilation_counter.num_cudagraph_captured += 1 # important: we need to return the output, rather than # the weak ref of the output, so that pytorch can correctly diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index 42f5d9c69a..05c663feb5 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -29,8 +29,6 @@ from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm_ascend.utils import vllm_version_is - class AscendScheduler(Scheduler): """This Scheduler extends vllm's original v1 scheduler @@ -129,12 +127,7 @@ def skip_cur_request(): continue assert num_new_tokens > 0 - - if vllm_version_is("0.9.0"): - blocks = computed_blocks.blocks - else: - blocks = computed_blocks.blocks[0] - + blocks = computed_blocks.blocks[0] watermark = getattr(self.scheduler_config, "watermark", 0.01) if not self._check_watermark_for_prefill(request, num_new_tokens, blocks, watermark): @@ -330,14 +323,8 @@ def _check_watermark_for_prefill(self, len(computed_blocks) * self.block_size) num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, self.block_size) - - if vllm_version_is("0.9.0"): - req_blocks = self.kv_cache_manager.single_type_manager.req_to_blocks[ - request.request_id] - else: - req_blocks = self.kv_cache_manager.coordinator.get_blocks( - request.request_id) - + req_blocks = self.kv_cache_manager.coordinator.get_blocks( + request.request_id) num_new_blocks = (num_required_blocks - len(req_blocks) - len(computed_blocks)) num_evictable_computed_blocks = sum(1 for blk in computed_blocks diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index 4be92c23a4..3c24bfc70f 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -24,9 +24,9 @@ # each worker's `__init__` function. # # Then in each kind of patch, there are three folders: -# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0. +# - patch_0_9_1: contains the patches applied when vllm version is 0.9.1. # - patch_main: contains the patches applied when vllm version is main branch. -# - patch_common: contains the patches applied in both 0.9.0 and main branch. 
+# - patch_common: contains the patches applied in both 0.9.1 and main branch. # # Once a new patch is added in vllm-ascend, please add the patch description into this file as well. # ---------------------------------------------------------------------------------- @@ -35,17 +35,6 @@ # -------------------------------- # * Platform Patch: # ================= -# ** File: platform/patch_0_9_0/patch_distributed.py** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. `vllm.distributed.utils.stateless_init_torch_distributed_process_group()` -# Why: -# vllm distributed use gloo backend by default to initialize stateless process group, but we want to use hccl here -# How: -# Add hccl backend to the `stateless_init_torch_distributed_process_group` -# Related PR (if no, explain why): -# https://github.com/vllm-project/vllm/pull/18763 -# Future Plan: -# Remove this patch once vllm is upgraded to 0.9.1 # ** File: platform/patch_common/patch_distributed.py** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.distributed.parallel_state.destroy_model_parallel()` diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index e724fe5314..4ec38e38d0 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -17,8 +17,8 @@ from vllm_ascend.utils import vllm_version_is # Import specific patches for different versions -if vllm_version_is("0.9.0"): - from vllm_ascend.patch.platform import patch_0_9_0 # noqa: F401 +if vllm_version_is("0.9.1"): + from vllm_ascend.patch.platform import patch_0_9_1 # noqa: F401 from vllm_ascend.patch.platform import patch_common # noqa: F401 else: from vllm_ascend.patch.platform import patch_common # noqa: F401 diff --git a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py b/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py deleted file mode 100644 index d468326bd3..0000000000 --- a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py +++ /dev/null @@ -1,116 +0,0 @@ -import torch -from torch.distributed import ProcessGroup -from torch.distributed.distributed_c10d import (Backend, PrefixStore, - _get_default_timeout, - is_nccl_available) -from torch.distributed.rendezvous import rendezvous -from vllm.distributed import utils - - -def stateless_init_torch_distributed_process_group( - host: str, port: int, rank: int, world_size: int, - backend: str) -> ProcessGroup: - """ - A replacement for `torch.distributed.init_process_group` that does not - pollute the global state. The created ProcessGroup object can be used for - some operations such as `allreduce`, because it does not depend on the - global rank. However, some operations such as `broadcast` cannot be used - because it depends on the global rank. - - # TODO: ask for help from PyTorch team if we need the `broadcast` operation. - - This function is useful when we are not sure about the total number of - processes in the process group. For example, we may have process - 1, 2, ..., 8 who want to communicate, and process 9 might be the same - process as process 1, or it might be a different process; process 10 - might be the same process as process 5, or it might be a different process. - In this case, how can we reliably form a communication channel within - process 9 and 10, without affecting the communication channel within - process 1, 2, ..., 8? 
- - One possible solution is to figure out if process 9 and 10 are the same - as process 1 and 5 beforehand, and then form a communication channel - based on the information, adjusting the ranks and world_size etc. However, - figuring out the information is not always easy, and it will interfere - with the main communication channel. - - Our solution is to always form a communication channel with process 1, 2, - ..., 8, and then use this function to form another communication channel - with process 9 and 10. This way, regardless of whether process 9 and 10 - are the same as process 1 and 5, the main communication channel is - always formed with process 1, 2, ..., 8, and the additional communication - channel is formed with process 9 and 10. - """ - init_method = f"tcp://{host}:{port}" - backend = Backend(backend) # it is basically string - timeout = _get_default_timeout(backend) - - store, rank, world_size = next( - rendezvous(init_method, rank, world_size, timeout=timeout)) - store.set_timeout(timeout) - - group_rank = rank - group_size = world_size - - # Use a PrefixStore to avoid accidental overrides of keys used by - # different systems (e.g. RPC) in case the store is multi-tenant. - prefix_store = PrefixStore(init_method, store) - - # TODO(Yizhou): The reason we need to set options while vllm does not - # seems to be related to the version of PyTorch. In the latest version, - # there is no need to set options. While in the older version, 2.5.1 - # specifically, we need to set options. - options = ProcessGroup.Options(backend=backend) - pg: ProcessGroup = ProcessGroup( - prefix_store, - group_rank, - group_size, - options, - ) - if backend == "gloo": - from torch.distributed.distributed_c10d import ProcessGroupGloo - backend_class = ProcessGroupGloo(prefix_store, - group_rank, - group_size, - timeout=timeout) - backend_type = ProcessGroup.BackendType.GLOO - device = torch.device("cpu") - elif backend == "nccl": - assert is_nccl_available() - from torch.distributed.distributed_c10d import ProcessGroupNCCL - - backend_options = ProcessGroupNCCL.Options() - backend_options._timeout = timeout - - backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, - backend_options) - backend_type = ProcessGroup.BackendType.NCCL - device = torch.device("cuda") - elif backend == "hccl": - from torch.distributed import is_hccl_available - assert is_hccl_available() - from torch_npu._C._distributed_c10d import ProcessGroupHCCL - backend_options = ProcessGroupHCCL.Options() - backend_options._timeout = timeout - backend_class = ProcessGroupHCCL(prefix_store, group_rank, group_size, - backend_options) - device = torch.device("npu") - backend_class._set_sequence_number_for_group() - backend_type = ProcessGroup.BackendType.CUSTOM - pg._register_backend(device, backend_type, backend_class) - return pg - else: - raise RuntimeError(f"Unsupported torch distributed backend: {backend}") - - # TODO(Yizhou): Like we mentioned above, _set_default_backend is not - # implemented in the 2.5.1 version of PyTorch. But we need to set it - # after the latest version is released. 
- # pg._set_default_backend(backend_type) - backend_class._set_sequence_number_for_group() - - pg._register_backend(device, backend_type, backend_class) - - return pg - - -utils.stateless_init_torch_distributed_process_group = stateless_init_torch_distributed_process_group diff --git a/vllm_ascend/patch/worker/patch_0_9_0/__init__.py b/vllm_ascend/patch/platform/patch_0_9_1/__init__.py similarity index 100% rename from vllm_ascend/patch/worker/patch_0_9_0/__init__.py rename to vllm_ascend/patch/platform/patch_0_9_1/__init__.py diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index d1d3d42f4e..3b29856d22 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -18,8 +18,8 @@ from vllm_ascend.utils import vllm_version_is # Import specific patches for different versions -if vllm_version_is("0.9.0"): - from vllm_ascend.patch.worker import patch_0_9_0 # noqa: F401 +if vllm_version_is("0.9.1"): + from vllm_ascend.patch.worker import patch_0_9_1 # noqa: F401 from vllm_ascend.patch.worker import patch_common # noqa: F401 else: from vllm_ascend.patch.worker import patch_common # noqa: F401 diff --git a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py similarity index 90% rename from vllm_ascend/patch/platform/patch_0_9_0/__init__.py rename to vllm_ascend/patch/worker/patch_0_9_1/__init__.py index f0ac16236a..116c73c06c 100644 --- a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py +++ b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py @@ -14,4 +14,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import vllm_ascend.patch.platform.patch_0_9_0.patch_distributed # noqa diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 9b0a860baa..c3587936f9 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -74,7 +74,7 @@ from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata from vllm_ascend.platform import NPUPlatform from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler -from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is +from vllm_ascend.utils import ProfileExecuteDuration from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer if TYPE_CHECKING: @@ -1614,44 +1614,27 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: import torch_npu kv_caches: Dict[str, torch.Tensor] = {} - # Remove this after we drop 0.9.0 support - if vllm_version_is("0.9.0"): - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.model_config.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=True, - vocab_size=self.model_config.get_vocab_size(), - block_size=self.cache_config.block_size, - ) - else: - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.model_config.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=True, - vocab_size=self.model_config.get_vocab_size(), - block_sizes=[self.cache_config.block_size], - ) + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.model_config.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=True, + vocab_size=self.model_config.get_vocab_size(), + block_sizes=[self.cache_config.block_size], + ) - if not 
vllm_version_is("0.9.0"): - kv_cache_sizes = {} - for kv_cache_tensor in kv_cache_config.kv_cache_tensors: - assert len(kv_cache_tensor.shared_by) == 1, ( - "KV cache tensor shared by multiple layers is not supported in " - "NPU.") - kv_cache_sizes[ - kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size + kv_cache_sizes = {} + for kv_cache_tensor in kv_cache_config.kv_cache_tensors: + assert len(kv_cache_tensor.shared_by) == 1, ( + "KV cache tensor shared by multiple layers is not supported in " + "NPU.") + kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size for kv_cache_group in kv_cache_config.kv_cache_groups: kv_cache_spec = kv_cache_group.kv_cache_spec for layer_name in kv_cache_group.layer_names: - if vllm_version_is("0.9.0"): - tensor_size = kv_cache_config.tensors[layer_name].size - else: - tensor_size = kv_cache_sizes[layer_name] + tensor_size = kv_cache_sizes[layer_name] assert tensor_size % kv_cache_spec.page_size_bytes == 0 num_blocks = tensor_size // kv_cache_spec.page_size_bytes From afcce8e4fac934fac2f1188a2950674c2e591e62 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Wed, 11 Jun 2025 17:23:48 +0800 Subject: [PATCH 024/153] fix bugs in fused_experts_with_all2all --- vllm_ascend/eplb/core/worker/eplb_worker.py | 2 +- vllm_ascend/quantization/w8a8_dynamic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 581a509c96..30e3919720 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -101,7 +101,7 @@ def compose_expert_update_info(self, updated_expert_maps, current_expert_maps): if dst_rank_id not in expert_recv_info_this_layer: expert_recv_info_this_layer[dst_rank_id] = [] - if not torch.isin(src_rank_indices, torch.tensor(expert_id)).any(): + if not torch.isin(torch.tensor(expert_id), experts_to_send).any(): # if expert_id are not sent out from any npu, it will be copied from one npu holding this expert candidate_src_rank_indices = torch.where(current_expert_maps_this_layer[:, expert_id] != -1) else: diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index 049b87f025..d3f3c4a2df 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -278,7 +278,7 @@ def fused_experts_with_all2all( log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, ): - if log2phy: + if log2phy is not None: topk_ids = log2phy[topk_ids] original_shape = hidden_states.shape if len(original_shape) == 3: From cc88ea763fdf0a311b2e6e5984083f94ea1c96b4 Mon Sep 17 00:00:00 2001 From: NPUDev Date: Wed, 11 Jun 2025 19:38:52 +0800 Subject: [PATCH 025/153] add eplb tabel generator --- vllm_ascend/eplb/tool/generate_map.py | 65 +++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 vllm_ascend/eplb/tool/generate_map.py diff --git a/vllm_ascend/eplb/tool/generate_map.py b/vllm_ascend/eplb/tool/generate_map.py new file mode 100644 index 0000000000..b498e73a06 --- /dev/null +++ b/vllm_ascend/eplb/tool/generate_map.py @@ -0,0 +1,65 @@ +import numpy as np +import json +import argparse + + +def split_and_insert(n, k, m): + ''' + n: expert num + k: card num + m: redundant expert num, make sure m%k==0 + ''' + + A = np.arange(n) + + B = np.random.choice(n, size=m, replace=False) + + groups = np.array_split(A, k) + + for j in range(m // k): + for i in range(k): + groups[i] = np.append(groups[i], B[i + j * k]) 
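    # Worked example (illustration only, values assumed): with n=8 experts,
    # k=2 cards and m=2 redundant experts, A=[0..7] is split into [0,1,2,3]
    # and [4,5,6,7]; the loop above appends one randomly drawn expert from B
    # to each group, e.g. B=[5,0] yields [0,1,2,3,5] and [4,5,6,7,0], and the
    # return below concatenates them into one placement of length n + m.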
+ return np.concatenate(groups) + + +def random_generation(n_layer=58, n_expert=256, start_layer_idx=0, device_count=128, n_redundant=128, output_name=""): + expert_data = {} + expert_data["moe_layer_count"] = n_layer + layer_list = [] + for i in range(n_layer): + layer = {"layer_id": start_layer_idx + i, "device_count": device_count} + random_placement = split_and_insert(n_expert, device_count, n_redundant) + device_list = [] + step = random_placement.shape[0] // device_count + for j in range(device_count): + device = {} + device["device_id"] = j + device["device_expert"] = random_placement[j * step: (j + 1) * step].tolist() + device_list.append(device) + layer["device_list"] = device_list + layer_list.append(layer) + + expert_data["layer_list"] = layer_list + json_file_path = output_name + + with open(json_file_path, "w") as f: + json.dump(expert_data, f, indent=4) + + print(f"JSON file generated: {json_file_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="python generate_map.py --n_layers 2 --n_experts 256 --card_num 8 --n_redundant 8 --output expert_map.json") + parser.add_argument("--n_layers", type=int, required=True) + parser.add_argument("--n_experts", type=int, required=True) + parser.add_argument("--card_num", type=int, required=True) + parser.add_argument("--n_redundant", type=int, default=0) + parser.add_argument("--output", type=str, default="expert_map.json") + args = parser.parse_args() + + n_layers = args.n_layers + n_experts = args.n_experts + card_num = args.card_num + n_redundant = args.n_redundant + output = args.output + + random_generation(n_layers, n_experts, 0, card_num, n_redundant, output) From 78079a78b0af38dff4a1d568566fbfe635b66bfe Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Wed, 11 Jun 2025 20:19:40 +0800 Subject: [PATCH 026/153] Adapt static EPLB --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 51 ++++++++++++++++++- .../core/loader/device_transfer_loader.py | 11 +++- vllm_ascend/eplb/eplb_updator.py | 2 + vllm_ascend/models/deepseek_v2.py | 14 ++--- vllm_ascend/ops/fused_moe.py | 12 ++++- 5 files changed, 77 insertions(+), 13 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 531ff205eb..b4afdfdfee 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -43,6 +43,11 @@ def __init__(self, model, **args): self.expert_map_per_layer[self.num_dense_layers + layer_idx] =\ self.model.get_expert_map(self.num_dense_layers + layer_idx) + self.log2phy_map_per_layer = dict() + for layer_idx in range(self.num_moe_layers): + self.log2phy_map_per_layer[self.num_dense_layers + layer_idx] =\ + self.model.get_log2phy_map(self.num_dense_layers + layer_idx) + def init_buffer_tensor_dict(self, num_buffer_tensor): for name in self.expert_weight_names: complete_name = "model.layers." + str(self.num_dense_layers) + ".mlp.experts." + name @@ -88,4 +93,48 @@ def do_update_expert_weight(self, layer_id, expert_id_before_replace, buffer_ten complete_name = "model.layers." + str(layer_id) + ".mlp.experts." 
+ name local_expert_id = self.expert_map_per_layer[layer_id][expert_id_before_replace].item() expert_tensor = self.param_dict[complete_name].data[local_expert_id] - expert_tensor.copy_(self.buffer_tensor_dict[name][buffer_tensor_id]) \ No newline at end of file + expert_tensor.copy_(self.buffer_tensor_dict[name][buffer_tensor_id]) + + def generate_index_dicts(self,tensor_2d): + dict_list = [] + current_idx = 0 + + for row in tensor_2d: + value_to_index = {} + for i in range(row.size(0)): + value = row[i].item() + value_to_index[value] = current_idx + i + dict_list.append(value_to_index) + current_idx += row.size(0) + + return dict_list + + def generate_log2phy_map(self, expert_map): + ranks_num, global_expert_num = expert_map.shape + concatenated = torch.flatten(expert_map) + rank_expert_to_global = self.generate_index_dicts( + expert_map) + result_dict: Dict[int, List[int]] = {} + for idx, value in enumerate(concatenated): + key = value.item() + if key not in result_dict: + result_dict[key] = [] + result_dict[key].append(idx) + + log2phy_map = torch.full((ranks_num, global_expert_num), + -1, + dtype=torch.int32) + for rank in range(ranks_num): + for key in result_dict: + indices_in_concat = result_dict[key] + if key in rank_expert_to_global[rank]: + log2phy_map[rank][key] = rank_expert_to_global[rank][key] + else: + chosen_index = random.choice(indices_in_concat) + log2phy_map[rank][key] = chosen_index + return log2phy_map + + def do_update_log2phy_map(self, layer_id, updated_log2phy_map): + + if self.log2phy_map_per_layer[layer_id] is not None: + self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map) \ No newline at end of file diff --git a/vllm_ascend/eplb/core/loader/device_transfer_loader.py b/vllm_ascend/eplb/core/loader/device_transfer_loader.py index ea48798670..17cc1500e3 100644 --- a/vllm_ascend/eplb/core/loader/device_transfer_loader.py +++ b/vllm_ascend/eplb/core/loader/device_transfer_loader.py @@ -33,6 +33,7 @@ def __init__(self, eplb_adaptor): self.eplb_adaptor = eplb_adaptor self.updated_expert_map = None + self.updated_log2phy_map = None self.layer_id = -1 # layer id to be updated self.state = ExpertWeightUpdateState.WAITING self.recv_expert_list = [] @@ -68,6 +69,9 @@ def generate_expert_d2d_transfer_task(self, expert_send_info, expert_recv_info, self.state = ExpertWeightUpdateState.READY + def generate_log2phy_map(self,expert_map): + self.updated_log2phy_map = self.eplb_adaptor.generate_log2phy_map(expert_map) + def asyn_expert_weight_transfer(self, reqs): # Only when send/recv tasks are parsed into self.comm_op_list, d2d send/recv tasks can be luanched if self.state != ExpertWeightUpdateState.READY: @@ -94,6 +98,9 @@ def update_expert_map_and_weight(self, reqs): # update expert_map self.eplb_adaptor.do_update_expert_map(self.layer_id, self.updated_expert_map) + #update log2phy_map + self.eplb_adaptor.do_update_log2phy_map(self.layer_id, self.updated_log2phy_map) + # update expert weight buffer_tensor_id = 0 for recv_expert_info in self.recv_expert_list: @@ -138,4 +145,6 @@ def generate_mock_update_info(self, rank_id): return (expert_send_info, expert_recv_info, updated_expert_map, layer_id) def load_impl(self, old_expert_table, new_expert_table): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError + + \ No newline at end of file diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 99cd42255b..5cc4d57bd7 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -89,9 
+89,11 @@ def forward_before(self): if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.block_update_queue.get() rank_id = torch.distributed.get_rank() + self.eplb_loader.generate_log2phy_map(updated_expert_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] # TODO: layer_id + 3 should be replaced by configuration + logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info_this_rank, expert_recv_info_this_rank, updated_expert_map[rank_id], layer_id + 3) self.weight_update_counter += 1 diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index e48df0508a..2ac69b9b18 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -723,18 +723,12 @@ def forward( inputs_embeds) return hidden_states - def update_expert_map(self,new_expert_map,layer_id): - self.model.layers[layer_id].mlp.experts.update_map(new_expert_map) - - def update_all_expert_map(self, new_expert_map,num_moe_layers): - num_moe_layers = len(new_expert_map) - for layer_id in range(num_moe_layers): - layer_map = new_expert_map[layer_id].to("npu") - self.model.layers[3+layer_id].mlp.experts.update_map(layer_map) - def get_expert_map(self,layer_id): return self.model.layers[layer_id].mlp.experts.get_map() + def get_log2phy_map(self,layer_id): + return self.model.layers[layer_id].mlp.experts.get_log2phy_map() + def get_all_expert_map(self,num_moe_layers): all_loads = [] for layer_id in range(num_moe_layers): @@ -752,7 +746,7 @@ def get_all_moe_loads(self, num_moe_layers) -> torch.Tensor: """ all_loads = [] for layer_id in range(num_moe_layers): - load_tensor = self.get_moe_load(3+layer_id) # (num_experts_per_layer,) + load_tensor = self.get_moe_load(3+layer_id) all_loads.append(load_tensor) return torch.stack(all_loads, dim=0) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 3a391653fb..438f15b7bf 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1030,6 +1030,11 @@ def __init__( self.log2phy = None self.global_redundant_expert_num = 0 + # test log2phy + self.log2phy = torch.full((self.ep_size, self.global_num_experts), + -1, + dtype=torch.int32) + ascend_config = get_ascend_config() expert_map_path = ascend_config.expert_map_path if expert_map_path and os.path.exists(expert_map_path): @@ -1154,6 +1159,8 @@ def forward(self, global_redundant_expert_num=self.global_redundant_expert_num, **kwargs) + print("log2phy:", self.log2phy) + # print("expert_map:", self.expert_map) self.calculate_moe_load() if self.enable_multistream_shared_expert and not is_prefill: @@ -1220,6 +1227,9 @@ def update_map(self,new_expert_map): def get_map(self): return self.expert_map + def get_log2phy_map(self): + return self.log2phy + def get_moe_load(self): return self.moe_load @@ -1234,4 +1244,4 @@ def calculate_moe_load(self): ones = torch.ones_like(ids, dtype=torch.int64, device=ids.device) self.moe_load.scatter_add_(0, ids, ones) - return self.moe_load \ No newline at end of file + return self.moe_load From 485e3d0228c8b9e70b879728ca01f0d2383c6311 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Wed, 11 Jun 2025 20:34:59 +0800 Subject: [PATCH 027/153] add enable_eplb in 
ascend_config --- vllm_ascend/ascend_config.py | 1 + vllm_ascend/worker/model_runner_v1.py | 10 +--------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 0c072c3a30..ca29859f4d 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -39,6 +39,7 @@ def __init__(self, vllm_config): self.expert_tensor_parallel_size = int( additional_config.get("expert_tensor_parallel_size", 0)) self.expert_map_path = additional_config.get("expert_map_path", None) + self.enable_eplb = additional_config.get("enable_eplb", False) class TorchairGraphConfig: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index d749d5bb7d..a8173c3c40 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -368,15 +368,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.dp_rank = vllm_config.parallel_config.data_parallel_rank #EPLB - self.enable_eplb = True - # if additional_config: - # self.enable_torchair_graph_mode = additional_config.get( - # "enable_graph_mode", - # False) and self.vllm_config.model_config.use_mla - # self.use_cached_npu_graph = additional_config.get( - # "use_cached_npu_graph", False) - # self.enable_eplb = additional_config.get("enable_eplb", False) - + self.enable_eplb = ascend_config.enable_eplb if self.enable_eplb == True: self.eplb_adaptor = None self.eplb_updator = EplbUpdator() From 3393d53b365e81ee5a547f4332b21381056c2822 Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Wed, 11 Jun 2025 20:55:44 +0800 Subject: [PATCH 028/153] [Scheduler][MTP] Add support for speculative decoding in AsecendScheduler. (#943) This PR adds support for speculative decoding in AsecendScheduler. Also inculde part of support for disaggregated prefill, full support will be merged in follow-up PR. --------- Signed-off-by: whx-sjtu <2952154980@qq.com> --- .github/workflows/vllm_ascend_test.yaml | 15 +- tests/singlecard/core/__init__.py | 0 .../singlecard/core/test_ascend_scheduler.py | 792 ++++++++++++++++++ .../core/test_ascend_scheduler_e2e.py | 40 + vllm_ascend/core/scheduler.py | 205 ++++- 5 files changed, 1002 insertions(+), 50 deletions(-) create mode 100644 tests/singlecard/core/__init__.py create mode 100644 tests/singlecard/core/test_ascend_scheduler.py create mode 100644 tests/singlecard/core/test_ascend_scheduler_e2e.py diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 073058d28f..30fbf0d5e3 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -180,18 +180,20 @@ jobs: run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py - pytest -sv tests/singlecard/test_scheduler.py # guided decoding doesn't work, fix it later # pytest -sv tests/singlecard/test_guided_decoding.py.py # test_ascend_config.py should be ran separately because it will regenerate the global config many times. 
pytest -sv tests/singlecard/test_ascend_config.py pytest -sv tests/singlecard/test_camem.py + # pytest -sv tests/singlecard/core/test_ascend_scheduler.py + # pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_scheduler.py \ --ignore=tests/singlecard/test_guided_decoding.py \ --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_camem.py + --ignore=tests/singlecard/test_camem.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py else pytest -sv tests/multicard/test_ilama_lora_tp2.py # To avoid oom, we need to run the test in a single process. @@ -209,20 +211,21 @@ jobs: run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py - pytest -sv tests/singlecard/test_scheduler.py # guided decoding doesn't work, fix it later # pytest -sv tests/singlecard/test_guided_decoding.py.py pytest -sv tests/singlecard/test_camem.py # test_ascend_config.py should be ran separately because it will regenerate the global config many times. pytest -sv tests/singlecard/test_ascend_config.py pytest -sv tests/singlecard/test_prompt_embedding.py + pytest -sv tests/singlecard/core/test_ascend_scheduler.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_scheduler.py \ --ignore=tests/singlecard/test_guided_decoding.py \ --ignore=tests/singlecard/test_camem.py \ --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_prompt_embedding.py + --ignore=tests/singlecard/test_prompt_embedding.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py else pytest -sv tests/multicard/test_ilama_lora_tp2.py # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. 
diff --git a/tests/singlecard/core/__init__.py b/tests/singlecard/core/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/singlecard/core/test_ascend_scheduler.py b/tests/singlecard/core/test_ascend_scheduler.py new file mode 100644 index 0000000000..c382ebdf40 --- /dev/null +++ b/tests/singlecard/core/test_ascend_scheduler.py @@ -0,0 +1,792 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import pytest +import torch +from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, + SchedulerConfig, SpeculativeConfig, VllmConfig) +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.sampling_params import SamplingParams +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec) +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.request import Request, RequestStatus +from vllm.v1.structured_output import StructuredOutputManager + +from vllm_ascend.core.scheduler import AscendScheduler +from vllm_ascend.utils import vllm_version_is + +EOS_TOKEN_ID = 50256 + + +def create_scheduler( + model: str = "Qwen/Qwen2.5-0.5B-Instruct", + max_num_seqs: int = 16, + max_num_batched_tokens: int = 8192, + enable_prefix_caching: Optional[bool] = None, + long_prefill_token_threshold: int = 0, + disable_chunked_mm_input: bool = False, + use_kv_connector: bool = False, + num_blocks: int = 10000, + block_size: int = 16, + max_model_len: Optional[int] = None, + num_speculative_tokens: Optional[int] = None, + enable_chunked_prefill: bool = False, +) -> AscendScheduler: + '''Create scheduler under test. + + Args: + model: model under test + max_num_seqs: max sequences to schedule + max_num_batch_tokens: max num tokens to batch + enable_prefix_caching: optionally force APC config + (True/False) or use default + (None) + + Returns: + {class}`Scheduler` instance + ''' + if max_model_len is None: + max_model_len = max_num_batched_tokens + scheduler_config = SchedulerConfig( + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + max_model_len=max_model_len, + long_prefill_token_threshold=long_prefill_token_threshold, + disable_chunked_mm_input=disable_chunked_mm_input, + enable_chunked_prefill=enable_chunked_prefill, + ) + model_config = ModelConfig( + model=model, + task="auto", + tokenizer=model, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="float16", + seed=42, + ) + # Cache config, optionally force APC + kwargs_cache = ({} if enable_prefix_caching is None else { + 'enable_prefix_caching': enable_prefix_caching + }) + cache_config = CacheConfig( + block_size=block_size, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + **kwargs_cache, + ) + kv_transfer_config = KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": "local_storage"}, + ) if use_kv_connector else None + + speculative_config: Optional[SpeculativeConfig] = None + if num_speculative_tokens is not None: + speculative_config = SpeculativeConfig( + model="ngram", num_speculative_tokens=num_speculative_tokens) + + vllm_config = VllmConfig( + scheduler_config=scheduler_config, + model_config=model_config, + cache_config=cache_config, + kv_transfer_config=kv_transfer_config, + speculative_config=speculative_config, + ) + kv_cache_config = KVCacheConfig( + 
num_blocks=num_blocks, # A large number of blocks to hold all requests + **({ + "tensors": {} + } if vllm_version_is("0.9.0") else { + "kv_cache_tensors": [] + }), + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(block_size, 1, 1, torch.float32, + False)) + ], + ) + cache_config.num_gpu_blocks = num_blocks + return AscendScheduler( + vllm_config=vllm_config, + kv_cache_config=kv_cache_config, + log_stats=True, + structured_output_manager=StructuredOutputManager(vllm_config), + ) + + +def create_requests(num_requests: int, + num_tokens: int = 10, + mm_positions: Optional[list[PlaceholderRange]] = None, + max_tokens: int = 16, + stop_token_ids: Optional[list[int]] = None, + prompt_logprobs: Optional[int] = None): + sampling_params = SamplingParams(ignore_eos=False, + max_tokens=max_tokens, + stop_token_ids=stop_token_ids, + prompt_logprobs=prompt_logprobs) + requests = [] + for i in range(num_requests): + if mm_positions is not None: + mm_position = mm_positions[i] + mm_inputs = [MultiModalKwargs({})] * len(mm_position) + else: + mm_position = None + mm_inputs = None + request = Request( + request_id=f"{i}", + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_inputs=mm_inputs, + multi_modal_placeholders=mm_position, + multi_modal_hashes=None, + eos_token_id=EOS_TOKEN_ID, + **({ + "arrival_time": 0.0 + } if vllm_version_is("0.9.0") else {}), + ) + requests.append(request) + return requests + + +def test_add_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + + for i, request in enumerate(requests): + scheduler.add_request(request) + assert request.request_id in scheduler.requests + assert len(scheduler.waiting) == i + 1 + + +def test_finish_request(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_ABORTED) + assert request.request_id not in scheduler.requests + assert len(scheduler.waiting) == 9 - i + + +def test_get_num_unfinished_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_STOPPED) + assert scheduler.get_num_unfinished_requests() == len(requests) - i - 1 + + +@pytest.mark.parametrize("enable_prefix_caching, prompt_logprobs", [ + (None, None), + (True, 5), +]) +def test_schedule(enable_prefix_caching: Optional[bool], + prompt_logprobs: Optional[int]): + '''Test scheduling. + Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs + ''' + scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching) + requests = create_requests(num_requests=10, + prompt_logprobs=prompt_logprobs) + for request in requests: + scheduler.add_request(request) + + # Test initial scheduling + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + # Verify all requests are scheduled. 
+ for req_id, num_tokens in output.num_scheduled_tokens.items(): + assert num_tokens == len(requests[int(req_id)].prompt_token_ids) + + # Verify requests moved from waiting to running + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == len(requests) + for i, request in enumerate(requests): + assert scheduler.running[i] == request + + +@pytest.mark.parametrize("enable_prefix_caching", [True, False]) +def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): + """Test scheduling behavior with concurrent partial requests. + + This test verifies that: there are multiple long prefill requests in the + RUNNING state, and we can schedule them together. + + """ + scheduler = create_scheduler( + model="facebook/opt-125m", + max_num_batched_tokens=1024, + long_prefill_token_threshold=400, + enable_prefix_caching=enable_prefix_caching, + enable_chunked_prefill=True, + ) + requests = create_requests( + num_requests=3, + num_tokens=800, + ) + for request in requests: + scheduler.add_request(request) + + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 3 + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + + # The first request is scheduled partially - 400. + assert output.num_scheduled_tokens[requests[0].request_id] == 400 + # The second request is scheduled partially - 400. + assert output.num_scheduled_tokens[requests[1].request_id] == 400 + # The third request is also scheduled partially - 1024 - 400 - 400 = 224. + assert output.num_scheduled_tokens[requests[2].request_id] == 224 + req_to_index = { + request.request_id: i + for i, request in enumerate(requests) + } + model_runner_output = ModelRunnerOutput( + req_ids=[request.request_id for request in requests], + req_id_to_index=req_to_index, + sampled_token_ids=[[] for _ in range(len(requests))], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(output, model_runner_output) + + # Schedule the next step. All three requests are running. + # Processed the remaining prefills of the first and second requests. + output1 = scheduler.schedule() + assert len(scheduler.running) == 3 + assert len(output1.scheduled_new_reqs) == 0 + assert len(output1.scheduled_cached_reqs) == 3 + assert len(output1.finished_req_ids) == 0 + assert output1.num_scheduled_tokens[requests[0].request_id] == 400 + assert output1.num_scheduled_tokens[requests[1].request_id] == 400 + assert output1.num_scheduled_tokens[requests[2].request_id] == 224 + + # Schedule the third step. All three requests are running. + # First and second requests are in the decode stage. + # All the remaining tokens in the third request are processed. 
+ model_runner_output = ModelRunnerOutput( + req_ids=[request.request_id for request in requests], + req_id_to_index=req_to_index, + sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(output1, model_runner_output) + output2 = scheduler.schedule() + assert len(scheduler.running) == 3 + assert len(output2.scheduled_new_reqs) == 0 + assert len(output2.scheduled_cached_reqs) == 3 + assert len(output2.finished_req_ids) == 0 + assert output2.num_scheduled_tokens[requests[0].request_id] == 1 + assert output2.num_scheduled_tokens[requests[1].request_id] == 1 + assert output2.num_scheduled_tokens[ + requests[2].request_id] == 800 - 224 - 224 + + +def test_stop_via_update_from_output(): + """Test stopping behavior through update_from_output""" + scheduler = create_scheduler(num_speculative_tokens=1) + + # Test case 1: Stop on EOS token + requests = create_requests(num_requests=2, max_tokens=10) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 1, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [], + requests[1].request_id: [10] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[EOS_TOKEN_ID], + [10, + 11]], # First request hits EOS, second continues + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped, second continues + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_STOPPED + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID] + assert list(requests[1].output_token_ids) == [10, 11] + + # Test case 2: Stop on custom stop token + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=2, + max_tokens=10, + stop_token_ids=[42, 43]) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=5, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 42], + requests[1].request_id: [13] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + spec_token_ids=None, + logprobs=None, + 
prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped on custom token + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_STOPPED + assert requests[0].stop_reason == 42 + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [10, 42] + assert list(requests[1].output_token_ids) == [13, 14] + + # Test case 3: Stop on max tokens + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=2, max_tokens=2) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 1 + }, + total_num_scheduled_tokens=4, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 11], + requests[1].request_id: [] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped due to length + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_LENGTH_CAPPED + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [10, 11 + ] # Truncated to max_tokens + assert list(requests[1].output_token_ids) == [13] + + # Test case 4: Ignore EOS flag + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=1, max_tokens=10) + requests[0].sampling_params.ignore_eos = True + requests[0].num_computed_tokens = requests[0].num_tokens + scheduler.requests[requests[0].request_id] = requests[0] + scheduler.running.append(requests[0]) + + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={requests[0].request_id: 3}, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [EOS_TOKEN_ID, 10] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify request continues past EOS + assert len(scheduler.running) == 1 + assert not requests[0].is_finished() + assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID, 10, 11] + + +@pytest.mark.parametrize("enable_prefix_caching, prompt_logprobs", [ + (None, None), + (True, 5), +]) +def test_schedule_concurrent_batches(enable_prefix_caching: 
Optional[bool], + prompt_logprobs: Optional[int]): + scheduler = create_scheduler( + max_num_batched_tokens=1024, + max_num_seqs=2, + enable_prefix_caching=enable_prefix_caching, + enable_chunked_prefill=True, + ) + requests = create_requests( + num_requests=2, + num_tokens=512, + prompt_logprobs=prompt_logprobs, + ) + + # Schedule the first request. + scheduler.add_request(requests[0]) + scheduler_output0 = scheduler.schedule() + assert len(scheduler_output0.scheduled_new_reqs) == 1 + assert scheduler_output0.num_scheduled_tokens[ + requests[0].request_id] == 512 + + # The first request is still running, so only schedule the second request. + scheduler.add_request(requests[1]) + scheduler_output1 = scheduler.schedule() + assert len(scheduler_output1.scheduled_new_reqs) == 1 + assert scheduler_output1.num_scheduled_tokens[ + requests[1].request_id] == 512 + + # Model output of the first request. + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(scheduler_output0, model_runner_output) + + # Schedule the next step. + # The first request can be scheduled again while the second + # request is still running. + scheduler_output2 = scheduler.schedule() + assert scheduler_output2.num_scheduled_tokens[requests[0].request_id] == 1 + + # Model output of the second request. + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(scheduler_output1, model_runner_output) + + +# Note - these test cases mirror some of those in test_rejection_sampler.py +@pytest.mark.parametrize( + "spec_tokens,output_tokens,expected", + [ + ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match + ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch + ([[1, 2], [3]], [[1, 2, 5], [3, 4]], + (2, 3, 3, [2, 1])), # multiple sequences + ([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence + ([[]], [[5]], (0, 0, 0, [0])), # empty sequence + ([[1, 2, 3], [4, 5, 6]], [[1, 2, 7], [4, 8]], + (2, 6, 3, [2, 1, 0])), # multiple mismatches + ]) +def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): + """Test scheduling behavior with speculative decoding. + + This test verifies that: + 1. Speculated tokens get scheduled correctly + 2. 
Spec decoding stats properly count number of draft and accepted tokens + """ + if vllm_version_is("0.9.0"): + return + num_spec_tokens = max(1, max(len(t) for t in spec_tokens)) + scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens) + requests = create_requests(num_requests=len(spec_tokens), num_tokens=1) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i + + # Schedule a decode, which will also draft speculative tokens + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert output.total_num_scheduled_tokens == len(requests) + for i in range(len(requests)): + req_id = requests[i].request_id + assert output.num_scheduled_tokens[req_id] == 1 + assert req_id not in output.scheduled_spec_decode_tokens + + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + spec_token_ids=spec_tokens, + logprobs=None, + prompt_logprobs_dict={}, + ) + engine_core_outputs = scheduler.update_from_output(output, + model_runner_output) + + for i in range(len(requests)): + running_req = scheduler.running[i] + # The prompt token + assert running_req.num_computed_tokens == 1 + # The prompt token and the sampled token + assert running_req.num_tokens == 2 + # The prompt token, the sampled token, and the speculated tokens + assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i]) + + # No draft or accepted tokens counted yet + assert not engine_core_outputs or ( + engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None) + + # Schedule the speculated tokens for validation + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 0 + # The sampled token and speculated tokens + assert output.total_num_scheduled_tokens == \ + len(requests) + sum(len(ids) for ids in spec_tokens) + for i in range(len(requests)): + req_id = requests[i].request_id + assert output.num_scheduled_tokens[req_id] == 1 + len(spec_tokens[i]) + if spec_tokens[i]: + assert len(output.scheduled_spec_decode_tokens[req_id]) == \ + len(spec_tokens[i]) + else: + assert req_id not in output.scheduled_spec_decode_tokens + + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + engine_core_outputs = scheduler.update_from_output(output, + model_runner_output) + + scheduler_stats = engine_core_outputs[0].scheduler_stats \ + if engine_core_outputs else None + if expected[0] == 0: + assert scheduler_stats.spec_decoding_stats is None # type: ignore + else: + assert scheduler_stats.spec_decoding_stats is not None # type: ignore + stats = scheduler_stats.spec_decoding_stats # type: ignore + assert stats.num_drafts == expected[0] + assert stats.num_draft_tokens == expected[1] + assert stats.num_accepted_tokens == expected[2] + assert stats.num_accepted_tokens_per_pos == expected[3] + + +def _assert_right_scheduler_output( + output: SchedulerOutput, + num_requests: int, + expected_num_scheduled_tokens: int, +): + """Check if SchedulerOutput is correct after remote KV cache hit.""" + + # We should inject the kv_connector_metadata. + assert len(output.kv_connector_metadata.requests) == num_requests + + # Only num_tokens - matched_num_new_tokens should be scheduled. 
+ for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + assert num_scheduled_tokens == expected_num_scheduled_tokens + + +def _assert_right_kv_cache_manager( + scheduler: AscendScheduler, + req_ids: list[str], + num_tokens: int, + block_size: int, + num_requests: int, + num_total_blocks: int, +): + """Check whether KVCacheManager is correct after allocate.""" + + # Make sure the request stats are right. + EXPECTED_TOTAL_BLOCKS = num_tokens // block_size + for req_id in req_ids: + blocks = (scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks[req_id]) + hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id] + assert (scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + num_cached_block[req_id] == EXPECTED_TOTAL_BLOCKS) + assert len(blocks) == EXPECTED_TOTAL_BLOCKS + assert len(hashes) == EXPECTED_TOTAL_BLOCKS + + # Make sure we actually touched all the blocks. + BLOCKS_PER_REQ = num_tokens / block_size + assert (scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == + num_total_blocks - num_requests * BLOCKS_PER_REQ) + + +def _step_until_done( + scheduler: AscendScheduler, + output: SchedulerOutput, + model_runner_output: ModelRunnerOutput, +): + """Loop over schedule(), update_from_output() until finished.""" + + all_finished = False + _ = scheduler.update_from_output(output, model_runner_output) + while not all_finished: + # Schedule + a few iterations until stopping. + output = scheduler.schedule() + assert len(scheduler.running) + for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + # We should be in the decode phase now. + assert num_scheduled_tokens == 1 + assert len(output.kv_connector_metadata.requests) == 0 + ecos = scheduler.update_from_output(output, model_runner_output)[0] + all_done = True + for eco in ecos.outputs: + if eco.finish_reason is None: + all_done = False + all_finished = all_done + + +def make_output(scheduler: AscendScheduler): + return ModelRunnerOutput( + req_ids=[req.request_id for req in scheduler.running], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(scheduler.running) + }, + sampled_token_ids=[[1000]] * len(scheduler.running), + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + + +def assert_scheduler_empty(scheduler: AscendScheduler): + """Confirm the scheduler is "empty" - i.e. no leaks.""" + # Scheduler Metadata. + assert len(scheduler.requests) == 0 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 0 + assert len(scheduler.finished_req_ids) == 0 + assert len(scheduler._cached_reqs_data) == 0 + + # EncoderCacheManager. + assert len(scheduler.encoder_cache_manager.freed) == 0 + assert len(scheduler.encoder_cache_manager.cached) == 0 + + # KVCache Manager. + if not vllm_version_is("0.9.0"): + assert len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks) == 0 + assert len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].num_cached_block) == 0 + assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 + num_free_blocks = ( + scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) + assert num_free_blocks == ( + scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1) + + # NOTE(rob): just the ref count on blocks will be 0. The hash + # value, etc will remain since we lazily evict for prefix cache. 
+ for block in scheduler.kv_cache_manager.block_pool.blocks: + assert block.ref_cnt == 0 + + +def test_memory_leak(): + """Test that we do not have a memory leak.""" + + scheduler = create_scheduler(enable_prefix_caching=True) + + NUM_REQUESTS = 5 + NUM_TOKENS = 10 + MAX_TOKENS = 10 + requests = create_requests(num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + max_tokens=MAX_TOKENS) + + # Add each request. + for request in requests: + scheduler.add_request(request) + scheduler_output = scheduler.schedule() + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Iterate until done. + while True: + scheduler_output = scheduler.schedule() + if len(scheduler.running) == 0: + break + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Confirm no memory leak. + assert_scheduler_empty(scheduler) \ No newline at end of file diff --git a/tests/singlecard/core/test_ascend_scheduler_e2e.py b/tests/singlecard/core/test_ascend_scheduler_e2e.py new file mode 100644 index 0000000000..668dafced9 --- /dev/null +++ b/tests/singlecard/core/test_ascend_scheduler_e2e.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + +import pytest +from vllm import LLM + +if os.getenv("VLLM_USE_V1", "0") != "1": + pytest.skip("Test package requires V1", allow_module_level=True) + +MODEL = "Qwen/Qwen2.5-0.5B-Instruct" +PROMPT = "Hello my name is Robert and I" + + +@pytest.fixture(scope="module") +def model() -> LLM: + return LLM( + MODEL, + enforce_eager=True, + enable_prefix_caching=True, + max_num_batched_tokens=200, + max_num_seqs=3, + additional_config={"ascend_scheduler_config": { + "enabled": True, + }}) + + +def test_concurrent_partial_prefill(model): + outputs = model.generate([PROMPT] * 3) + assert len(outputs) == 3 + for output in outputs: + assert len(output.outputs) == 1 + + +def test_prefix_cache_stats_is_recorded(model): + # 17 tokens will make sure first 16 tokens are cached in a block + input_tokens = {"prompt_token_ids": [101] * 129} + _ = model.generate([input_tokens]) + outputs = model.generate([input_tokens]) + assert outputs[0].num_cached_tokens == 128 \ No newline at end of file diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index 05c663feb5..2fa31c264c 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -14,16 +14,19 @@ # limitations under the License. # This file is a part of the vllm-ascend project. 
# +import time from collections import deque from typing import Iterable, Union from vllm.config import VllmConfig +from vllm.distributed.kv_events import KVEventBatch from vllm.logger import logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.utils import cdiv +from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler -from vllm.v1.engine import EngineCoreOutputs +from vllm.v1.engine import EngineCoreEventType, EngineCoreOutputs from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus @@ -49,11 +52,6 @@ def __init__( self.scheduled_req_ids: set[str] = set() self.running: list[Request] = [] - if self.vllm_config.kv_transfer_config is not None and \ - self.vllm_config.kv_transfer_config.is_kv_consumer: - raise ValueError( - "AscendScheduler cannot be used for decode nodes. ") - def schedule(self) -> SchedulerOutput: if self.scheduler_config.chunked_prefill_enabled: return super().schedule() @@ -68,6 +66,9 @@ def schedule(self) -> SchedulerOutput: # Spec decode-related. scheduled_spec_decode_tokens: dict[str, list[int]] = {} + # For logging. + scheduled_timestamp = time.monotonic() + # Record scheduled LoRA requests. scheduled_loras: set[int] = set() @@ -86,6 +87,18 @@ def skip_cur_request(): self.waiting.popleft() skipped_waiting_requests.appendleft(request) + num_prealloc_computed_tokens = 0 + # P/D: skip request if still waiting for remote kvs. + if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + is_ready = self._update_waiting_for_remote_kv(request) + if is_ready: + request.status = RequestStatus.WAITING + num_prealloc_computed_tokens = ( + request.num_computed_tokens) + else: + skip_cur_request() + continue + # Check that adding the request still respects the max_loras # constraint. if (self.lora_config and request.lora_request and @@ -95,39 +108,72 @@ def skip_cur_request(): skip_cur_request() continue - prompt_limit = self._get_prompt_limit(request) + num_external_computed_tokens = 0 + load_kv_async = False + # Get already-cached tokens. - computed_blocks, num_computed_tokens = ( - self.kv_cache_manager.get_computed_blocks(request)) - num_new_tokens = request.num_tokens - num_computed_tokens - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) - max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * - self.block_size) - prompt_limit = min(prompt_limit, max_tokens_in_kvcache) - - # Finish request that exceeds prompt_limit or kv cache size. - if num_new_tokens > prompt_limit: - logger.warning( - "Input prompt (%d tokens) is too long" - " and exceeds limit of %d", - num_new_tokens, - prompt_limit, - ) - request.status = RequestStatus.FINISHED_IGNORED - self.finished_req_ids.add(request.request_id) # type: ignore - self.waiting.popleft() - continue + if num_prealloc_computed_tokens == 0: + new_computed_blocks, num_native_computed_tokens = \ + self.kv_cache_manager.get_computed_blocks( + request) + + # Get externally-cached tokens if using a KVConnector. + if self.connector is not None: + num_external_computed_tokens, load_kv_async = ( + self.connector.get_num_new_matched_tokens( + request, num_native_computed_tokens)) + + # Total computed tokens (local + external). 
+ num_computed_tokens = (num_native_computed_tokens + + num_external_computed_tokens) + else: + # P/D: skip checking prefix cache if loaded from remote kvs. + new_computed_blocks = KVCacheBlocks.create_empty() + num_native_computed_tokens = 0 + + # Total computed tokens (allocated in prior step). + num_computed_tokens = num_prealloc_computed_tokens + + # P/D: loading remote KV, do not allocate for new work. + if load_kv_async: + assert num_external_computed_tokens > 0 + num_new_tokens = 0 + blocks = None + # Number of tokens to be scheduled. + else: + prompt_limit = self._get_prompt_limit(request) + # Get already-cached tokens. + computed_blocks, num_computed_tokens = ( + self.kv_cache_manager.get_computed_blocks(request)) + # We use `request.num_tokens` instead of + # `request.num_prompt_tokens` to consider the resumed + # requests, which have output tokens. + num_new_tokens = request.num_tokens - num_computed_tokens + max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * + self.block_size) + prompt_limit = min(prompt_limit, max_tokens_in_kvcache) + + # Finish request that exceeds prompt_limit or kv cache size. + if num_new_tokens > prompt_limit: + logger.warning( + "Input prompt (%d tokens) is too long" + " and exceeds limit of %d", + num_new_tokens, + prompt_limit, + ) + request.status = RequestStatus.FINISHED_IGNORED + self.finished_req_ids.add( # type: ignore + request.request_id) # type: ignore + self.waiting.popleft() + continue - if num_new_tokens > token_budget: - # Scheduling would exceed token_budget, skip. - skip_cur_request() - continue + if num_new_tokens > token_budget: + # Scheduling would exceed token_budget, skip. + skip_cur_request() + continue + assert num_new_tokens > 0 + blocks = computed_blocks.blocks[0] - assert num_new_tokens > 0 - blocks = computed_blocks.blocks[0] watermark = getattr(self.scheduler_config, "watermark", 0.01) if not self._check_watermark_for_prefill(request, num_new_tokens, blocks, watermark): @@ -136,13 +182,38 @@ def skip_cur_request(): continue new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens, new_computed_blocks=computed_blocks) + request, + num_new_tokens + num_external_computed_tokens, + num_native_computed_tokens, + new_computed_blocks=computed_blocks, + num_lookahead_tokens=self.num_lookahead_tokens, + delay_cache_blocks=load_kv_async) if new_blocks is None: # The request cannot be scheduled. break + # KVConnector: update internal state after allocation. + # This information is used to determine if a load is + # needed for this request. + if num_external_computed_tokens: + assert self.connector is not None + self.connector.update_state_after_alloc( + request, + new_computed_blocks + new_blocks, + num_external_computed_tokens, + ) + self.waiting.popleft() + if load_kv_async: + # If loading async, allocate memory and put request + # into the WAITING_FOR_REMOTE_KV state. + skipped_waiting_requests.appendleft(request) + request.status = RequestStatus.WAITING_FOR_REMOTE_KVS + continue self.running.append(request) + if self.log_stats: + request.record_event(EngineCoreEventType.SCHEDULED, + scheduled_timestamp) self.scheduled_req_ids.add(request.request_id) # Check request status. if request.status == RequestStatus.WAITING: @@ -161,6 +232,9 @@ def skip_cur_request(): token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens + # Count the number of prifix cached tokens. 
+ if request.num_cached_tokens < 0: + request.num_cached_tokens = num_computed_tokens # Put back any skipped requests at the head of the waiting queue if skipped_waiting_requests: @@ -179,16 +253,45 @@ def skip_cur_request(): num_new_tokens = (request.num_tokens_with_spec - request.num_computed_tokens) - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) + assert (request.num_tokens - request.num_computed_tokens) == 1 num_new_tokens = min(num_new_tokens, token_budget) - assert num_new_tokens == 1 + # Make sure the input position does not exceed the max model len. + # This is necessary when using spec decoding. + num_new_tokens = min( + num_new_tokens, + self.max_model_len - request.num_computed_tokens) + # Check that adding the request still respects the max_loras + # constraint. + if self.lora_config and request.lora_request and ( + len(scheduled_loras) == self.lora_config.max_loras + and request.lora_request.lora_int_id + not in scheduled_loras): + # Scheduling would exceed max_loras, skip. + num_new_tokens = 0 + + if num_new_tokens == 0: + # The request cannot be scheduled because one of the following + # reason: + # 1. No new tokens to schedule. This may happen when PP>1 and + # we have already scheduled all prompt tokens but they are + # not finished yet. + # 2. Adding the request exceeds the max_loras constraint. + # NOTE(woosuk): Here, by doing `continue` instead of `break`, + # we do not strictly follow the FCFS scheduling policy and + # allow the lower-priority requests to be scheduled. + req_index += 1 + continue + + num_draft_tokens = max( + num_new_tokens + request.num_computed_tokens - + request.num_tokens, 0) while True: new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens) + request, + num_new_tokens, + num_draft_tokens=num_draft_tokens, + num_lookahead_tokens=self.num_lookahead_tokens) if new_blocks is None: # The request cannot be scheduled. # Preempt the lowest-priority request. @@ -196,6 +299,10 @@ def skip_cur_request(): self.kv_cache_manager.free(preempted_req) preempted_req.status = RequestStatus.PREEMPTED preempted_req.num_computed_tokens = 0 + if self.log_stats: + preempted_req.record_event( + EngineCoreEventType.PREEMPTED, + scheduled_timestamp) self.waiting.appendleft(preempted_req) preempted_reqs.append(preempted_req) if preempted_req == request: @@ -230,6 +337,10 @@ def skip_cur_request(): scheduled_spec_decode_tokens[request.request_id] = ( request.spec_token_ids) + # Record scheduled LoRA requests. + if self.lora_config and request.lora_request: + scheduled_loras.add(request.lora_request.lora_int_id) + # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens @@ -297,6 +408,11 @@ def skip_cur_request(): meta = self.connector.build_connector_meta(scheduler_output) scheduler_output.kv_connector_metadata = meta + events = self.kv_cache_manager.take_events() + if events: + batch = KVEventBatch(ts=time.time(), events=events) + self.kv_event_publisher.publish(batch) + # Advance the number of computed tokens for the request AFTER # the request is scheduled. # 1. The scheduler_output of the current step has to include the @@ -388,7 +504,8 @@ def update_from_output( if num_tokens_scheduled == 0: # The request was not scheduled in this step. 
continue - self.scheduled_req_ids.remove(req_id) + if req_id in self.scheduled_req_ids: + self.scheduled_req_ids.remove(req_id) return super().update_from_output(scheduler_output, model_runner_output) From 9e5e1176bd485ca9e2873d746f053139f64f8761 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Wed, 11 Jun 2025 20:55:57 +0800 Subject: [PATCH 029/153] enable_eplb -> dynamic_eplb --- vllm_ascend/ascend_config.py | 2 +- vllm_ascend/ops/fused_moe.py | 2 -- vllm_ascend/worker/model_runner_v1.py | 8 ++++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index ca29859f4d..b942086673 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -39,7 +39,7 @@ def __init__(self, vllm_config): self.expert_tensor_parallel_size = int( additional_config.get("expert_tensor_parallel_size", 0)) self.expert_map_path = additional_config.get("expert_map_path", None) - self.enable_eplb = additional_config.get("enable_eplb", False) + self.dynamic_eplb = additional_config.get("dynamic_eplb", False) class TorchairGraphConfig: diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 438f15b7bf..d5d910ad48 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1159,8 +1159,6 @@ def forward(self, global_redundant_expert_num=self.global_redundant_expert_num, **kwargs) - print("log2phy:", self.log2phy) - # print("expert_map:", self.expert_map) self.calculate_moe_load() if self.enable_multistream_shared_expert and not is_prefill: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index a8173c3c40..1198609b90 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -368,8 +368,8 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.dp_rank = vllm_config.parallel_config.data_parallel_rank #EPLB - self.enable_eplb = ascend_config.enable_eplb - if self.enable_eplb == True: + self.dynamic_eplb = ascend_config.dynamic_eplb + if self.dynamic_eplb == True: self.eplb_adaptor = None self.eplb_updator = EplbUpdator() @@ -1216,7 +1216,7 @@ def execute_model( intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[ModelRunnerOutput, torch.Tensor]: - if self.enable_eplb: + if self.dynamic_eplb: if self.eplb_adaptor == None: self.eplb_adaptor = VllmEplbAdaptor(model=self.model) self.eplb_updator.set_adaptor(self.eplb_adaptor) @@ -1337,7 +1337,7 @@ def execute_model( print(f"Profile execute duration [{captured_name}]:", " ".join(dr_str)) - if self.enable_eplb: + if self.dynamic_eplb: self.eplb_updator.forward_end() return model_runner_output From 2498d297ae3cae20742563e31e89480dc25388a6 Mon Sep 17 00:00:00 2001 From: ttanzhiqiang <38750855+ttanzhiqiang@users.noreply.github.com> Date: Thu, 12 Jun 2025 10:44:33 +0800 Subject: [PATCH 030/153] add custom ascendc kernel vocabparallelembedding (#796) This PR add custom ascendc kernel vocabparallelembedding support in vllm-ascend, related CMakeLists and setuptools is also added in this PR. 
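A minimal usage sketch of the new custom op, mirroring the added unit test (illustrative only; the op becomes available once vllm_ascend is imported, as the tests do):

import torch
import torch_npu  # noqa: F401
import vllm_ascend.platform  # noqa: F401  # registers the _C custom ops

x = torch.randint(0, 1000, (3, 4, 5), dtype=torch.int32, device="npu:0")
# org vocab range [100, 200), no padding, added (LoRA) vocab range [300, 400)
masked_input, mask = torch.ops._C.get_masked_input_and_mask(x, 100, 200, 0, 300, 400)
# masked_input holds the re-based ids (ids outside both ranges become 0);
# mask is True for ids that fall outside both vocab ranges.

Run the added benchmark and test with: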
pytest -s benchmarks/ops/ben_vocabparallelembedding.py pytest -s tests/ops/test_vocabparallelembedding.py --------- Signed-off-by: ttanzhiqiang <389825161@qq.com> --- CMakeLists.txt | 2 - benchmarks/ops/ben_vocabparallelembedding.py | 144 ++++++++ .../get_masked_input_and_mask_kernel.cpp | 345 ++++++++++++++++++ csrc/ops.h | 14 + csrc/torch_binding.cpp | 116 ++++++ tests/ops/test_vocabparallelembedding.py | 91 +++++ 6 files changed, 710 insertions(+), 2 deletions(-) create mode 100644 benchmarks/ops/ben_vocabparallelembedding.py create mode 100644 csrc/kernels/get_masked_input_and_mask_kernel.cpp create mode 100644 tests/ops/test_vocabparallelembedding.py diff --git a/CMakeLists.txt b/CMakeLists.txt index a2c3ad2e16..8d06c75f29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,5 +96,3 @@ target_link_libraries( target_link_options(vllm_ascend_C PRIVATE "-Wl,-rpath,$ORIGIN:$ORIGIN/lib") install(TARGETS vllm_ascend_C vllm_ascend_kernels DESTINATION ${VLLM_ASCEND_INSTALL_PATH}) - - diff --git a/benchmarks/ops/ben_vocabparallelembedding.py b/benchmarks/ops/ben_vocabparallelembedding.py new file mode 100644 index 0000000000..e91cfed7b9 --- /dev/null +++ b/benchmarks/ops/ben_vocabparallelembedding.py @@ -0,0 +1,144 @@ +from typing import Tuple + +import numpy as np +import pytest +import torch +import torch_npu # noqa: F401 +import vllm # noqa: F401 + +import vllm_ascend.platform # noqa: F401 + + +def benchmark_npu(fn, num_iterations=100, num_warmup_iterations=50): + """ + Benchmark function for NPU operations + + Args: + fn: Function to benchmark + num_iterations: Number of timing iterations + num_warmup_iterations: Number of warmup iterations + + Returns: + float: Minimum elapsed time in seconds + """ + start = torch.npu.Event(enable_timing=True) + end = torch.npu.Event(enable_timing=True) + times = np.zeros(num_iterations + num_warmup_iterations) + + # Run iterations + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + fn() # Execute the function + end.record() + torch.npu.synchronize() + times[i] = start.elapsed_time(end) + + # Remove warmup iterations and convert to seconds + times = times[num_warmup_iterations:] + elapsed_time = np.amin(times) / 1000 + return elapsed_time + + +def get_masked_input_and_mask_ref( + input_: torch.Tensor, org_vocab_start_index: int, + org_vocab_end_index: int, num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Reference implementation for verification""" + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < + org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index) + added_offset = added_vocab_start_index - ( + org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding + valid_offset = (org_vocab_start_index * + org_vocab_mask) + (added_offset * added_vocab_mask) + vocab_mask = org_vocab_mask | added_vocab_mask + masked_input = vocab_mask * (input_ - valid_offset) + return masked_input, ~vocab_mask + + +DTYPES = [torch.int32] +SHAPES = [(3, 4, 5)] +DEVICES = [f"npu:{0}"] +SEEDS = [0] + + +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEEDS) +@torch.inference_mode() +def test_get_masked_input_and_mask( + shape: Tuple[int, ...], + dtype: torch.dtype, + device: str, + seed: int, +) -> None: + # Set random seed and device + 
torch.manual_seed(seed) + torch.set_default_device(device) + + # Generate random input tensor + input_tensor = torch.randint(0, 1000, shape, dtype=dtype) + + # Test parameters + test_case = { + "org_start": 100, + "org_end": 200, + "padding": 0, + "added_start": 300, + "added_end": 400, + } + + # Define reference function + def ref_fn(): + return get_masked_input_and_mask_ref(input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"]) + + # Define custom function + def custom_fn(): + return torch.ops._C.get_masked_input_and_mask(input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"]) + + # Get results for correctness testing + ref_masked_input, ref_mask = ref_fn() + custom_masked_input, custom_mask = custom_fn() + + # Benchmark both implementations + ref_time = benchmark_npu(ref_fn) + custom_time = benchmark_npu(custom_fn) + + # Print performance results + print("\nPerformance Results:") + print(f"Reference implementation: {ref_time*1000:.3f} ms") + print(f"Custom implementation: {custom_time*1000:.3f} ms") + print(f"Speedup: {ref_time/custom_time:.2f}x") + + # Compare results for correctness + ref_masked_input = ref_masked_input.to(dtype) + print("\nResults comparison:") + print("custom_masked_input:", custom_masked_input) + print("ref_masked_input:", ref_masked_input) + print("custom_mask:", custom_mask) + print("ref_mask:", ref_mask) + torch.testing.assert_close( + custom_masked_input, + ref_masked_input, + rtol=1e-5, + atol=1e-5, + msg=f"Masked input mismatch for case: {test_case}") + torch.testing.assert_close(custom_mask, + ref_mask, + rtol=1e-5, + atol=1e-5, + msg=f"Mask mismatch for case: {test_case}") diff --git a/csrc/kernels/get_masked_input_and_mask_kernel.cpp b/csrc/kernels/get_masked_input_and_mask_kernel.cpp new file mode 100644 index 0000000000..47ce826f22 --- /dev/null +++ b/csrc/kernels/get_masked_input_and_mask_kernel.cpp @@ -0,0 +1,345 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. 
+ */ + +#include "kernel_operator.h" +#include "kernel_tensor_impl.h" +#include "kernel_type.h" +#include "types.h" +#include "utils.h" +using vllm_ascend::AccType; + +template +class GetMaskedInputAndMask { +public: + __aicore__ inline GetMaskedInputAndMask() {} + + __aicore__ inline ~GetMaskedInputAndMask() { + pipe.Reset(); + } + + + __aicore__ inline void Init( + __gm__ scalar_t* input, + __gm__ scalar_t* masked_input, + __gm__ bool* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size) + { + // Initialize basic parameters + input_ = input; + masked_input_ = masked_input; + mask_out_ = mask_out; + org_vocab_start_index_ = org_vocab_start_index; + org_vocab_end_index_ = org_vocab_end_index; + size_ = ((size + 31) / 32) * 32; + added_offset_ = added_vocab_start_index - + (org_vocab_end_index - org_vocab_start_index) - + num_org_vocab_padding; + added_vocab_start_index_ = added_vocab_start_index; + added_vocab_end_index_ = added_vocab_end_index; + + // Initialize global tensors + inputGlobal.SetGlobalBuffer(input); + maskedOutputGlobal.SetGlobalBuffer(masked_input); + maskOutGlobal.SetGlobalBuffer(mask_out); + + // Initialize queues + pipe.InitBuffer(inQueue, 1, size_ * sizeof(scalar_t)); + pipe.InitBuffer(outQueue, 1, size_ * sizeof(scalar_t)); + pipe.InitBuffer(maskQueue, 1, size_ * sizeof(bool)); + + // Initialize calculation buffers + pipe.InitBuffer(calc_buf_1, size_ * sizeof(float)); + pipe.InitBuffer(calc_buf_2, size_ * sizeof(float)); + + // Initialize result queues + pipe.InitBuffer(result_ge_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_le_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_org_mask_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_add_mask_que, BUFFER_NUM, size_ * sizeof(float)); + + // Initialize temporary buffers + pipe.InitBuffer(start_buf, size_ * sizeof(float)); + pipe.InitBuffer(end_buf, size_ * sizeof(float)); + pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); + pipe.InitBuffer(validOffset_buf, size_ * sizeof(float)); + pipe.InitBuffer(vocabMask_buf_, size_ * sizeof(int8_t)); + pipe.InitBuffer(ones_buf_, size_ * sizeof(float)); + } + + __aicore__ inline void Process() + { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() + { + AscendC::LocalTensor inputLocal = inQueue.AllocTensor(); + AscendC::DataCopy(inputLocal, inputGlobal, size_); + inQueue.EnQue(inputLocal); + } + + __aicore__ inline void CompareWithValue( + AscendC::LocalTensor& result, + const AscendC::LocalTensor& input, + const AscendC::LocalTensor& compare_value, + bool is_greater_equal) { + + AscendC::LocalTensor compute_buf = calc_buf_1.Get(); + if (is_greater_equal) { + AscendC::Max(compute_buf, input, compare_value, size_); + AscendC::Sub(compute_buf, compare_value, compute_buf, size_); + } else { + AscendC::Max(compute_buf, input, compare_value, size_); + AscendC::Sub(compute_buf, compute_buf, compare_value, size_); + } + + AscendC::Abs(compute_buf, compute_buf, size_); + AscendC::Mins(compute_buf, compute_buf, MIN_ACCURACY_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_2_FP32, size_); + AscendC::Adds(compute_buf, compute_buf, NEGATIVE_ONE_FP32, size_); + AscendC::Abs(compute_buf, 
compute_buf, size_); + + AscendC::LocalTensor compute_buf_fp16 = calc_buf_2.Get(); + AscendC::Cast(compute_buf_fp16, compute_buf, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(result, compute_buf_fp16, AscendC::RoundMode::CAST_NONE, size_); + } + + __aicore__ inline void ComputeRangeMask( + AscendC::LocalTensor& range_mask, + const AscendC::LocalTensor& input, + const float start_value, + const float end_value) { + + // Use already initialized buffers + AscendC::LocalTensor start_value_tensor = start_buf.Get(); + AscendC::LocalTensor end_value_tensor = end_buf.Get(); + + AscendC::Duplicate(start_value_tensor, start_value, size_); + AscendC::Duplicate(end_value_tensor, end_value, size_); + + AscendC::LocalTensor ge_result = result_ge_que.AllocTensor(); + AscendC::LocalTensor lt_result = result_le_que.AllocTensor(); + + CompareWithValue(ge_result, start_value_tensor, input, true); + CompareWithValue(lt_result, input, end_value_tensor, false); + + AscendC::And(range_mask, ge_result, lt_result, size_); + } + + __aicore__ inline void Compute() { + AscendC::LocalTensor inputLocal = inQueue.DeQue(); + AscendC::LocalTensor maskedLocal = outQueue.AllocTensor(); + AscendC::LocalTensor maskLocal = maskQueue.AllocTensor(); + + AscendC::LocalTensor inputFloat = inputFloat_buf.Get(); + AscendC::Cast(inputFloat, inputLocal, AscendC::RoundMode::CAST_NONE, size_); + + // Calculate mask for org_vocab range + // org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) + AscendC::LocalTensor orgVocabMask = result_org_mask_que.AllocTensor(); + ComputeRangeMask(orgVocabMask, + inputFloat, + static_cast(org_vocab_start_index_), + static_cast(org_vocab_end_index_)); + + // Calculate mask for added_vocab range + // added_vocab_mask = (input_ >= added_vocab_start_index) & (input_ < added_vocab_end_index) + AscendC::LocalTensor addedVocabMask = result_add_mask_que.AllocTensor(); + ComputeRangeMask(addedVocabMask, + inputFloat, + static_cast(added_vocab_start_index_), + static_cast(added_vocab_end_index_)); + + // Calculate validOffset + // valid_offset = (org_vocab_start_index * org_vocab_mask) + (added_offset * added_vocab_mask) + AscendC::LocalTensor validOffset = validOffset_buf.Get(); + AscendC::LocalTensor constOrgStartIndex = start_buf.Get(); + + AscendC::Duplicate(constOrgStartIndex, float(org_vocab_start_index_), size_); + + AscendC::LocalTensor orgVocabMask_fp16; + AscendC::LocalTensor orgVocabMask_fp32; + AscendC::Cast(orgVocabMask_fp16, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(orgVocabMask_fp32, orgVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Mul(validOffset, + constOrgStartIndex, + orgVocabMask_fp32, + size_); + + AscendC::LocalTensor addedOffset; + AscendC::LocalTensor addedOffsetTensor = end_buf.Get(); + AscendC::Duplicate(addedOffsetTensor, float(added_offset_), size_); + + AscendC::LocalTensor addedVocabMask_fp16; + AscendC::LocalTensor addedVocabMask_fp32; + AscendC::Cast(addedVocabMask_fp16, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(addedVocabMask_fp32, addedVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Mul(addedOffset, + addedOffsetTensor, + addedVocabMask_fp32, + size_); + + AscendC::Add(validOffset, validOffset, addedOffset, size_); + + // vocab_mask = org_vocab_mask | added_vocab_mask + AscendC::LocalTensor vocabMask = vocabMask_buf_.Get(); + + AscendC::Or(vocabMask, + orgVocabMask, + addedVocabMask, + size_); + + AscendC::Sub(inputFloat, inputFloat, 
validOffset, size_); + + // input_ = vocab_mask * (input_ - valid_offset) + AscendC::LocalTensor vocabMask_fp16; + AscendC::LocalTensor vocabMask_fp32; + AscendC::Cast(vocabMask_fp16, vocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(vocabMask_fp32, vocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::LocalTensor inputFloat_fp32; + AscendC::Mul(inputFloat, inputFloat, vocabMask_fp32, size_); + + AscendC::Cast(maskedLocal, inputFloat, AscendC::RoundMode::CAST_CEIL, size_); + outQueue.EnQue(maskedLocal); + + // ~vocab_mask + AscendC::LocalTensor ones_tensor = ones_buf_.Get(); + AscendC::Duplicate(ones_tensor, (float)1, size_); + AscendC::LocalTensor maskLocal_fp32; + + AscendC::Sub(maskLocal_fp32, + ones_tensor, + vocabMask_fp32, + size_); + + AscendC::LocalTensor maskLocal_fp16; + AscendC::Cast(maskLocal_fp16, maskLocal_fp32, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(maskLocal, maskLocal_fp16, AscendC::RoundMode::CAST_NONE, size_); + maskQueue.EnQue(maskLocal); + inQueue.FreeTensor(inputLocal); + } + + __aicore__ inline void CopyOut() + { + AscendC::LocalTensor maskedLocal = outQueue.DeQue(); + AscendC::LocalTensor maskLocal = maskQueue.DeQue(); + + AscendC::DataCopy(maskedOutputGlobal, maskedLocal, size_); + AscendC::DataCopy(maskOutGlobal, maskLocal, size_); + + outQueue.FreeTensor(maskedLocal); + maskQueue.FreeTensor(maskLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + AscendC::TPipe pipe; + AscendC::TQue inQueue; + AscendC::TQue outQueue, maskQueue; + AscendC::GlobalTensor inputGlobal, maskedOutputGlobal; + AscendC::GlobalTensor maskOutGlobal; + AscendC::TBuf calc_buf_1; + AscendC::TBuf calc_buf_2; + AscendC::TQue result_ge_que; + AscendC::TQue result_le_que; + AscendC::TQue result_org_mask_que; + AscendC::TQue result_add_mask_que; + + // Temporary buffers + AscendC::TBuf start_buf; + AscendC::TBuf end_buf; + + // Temporary buffers continued + AscendC::TBuf inputFloat_buf; + AscendC::TBuf validOffset_buf; + AscendC::TBuf vocabMask_buf_; + AscendC::TBuf ones_buf_; + + __gm__ scalar_t *input_, *masked_input_; + __gm__ bool *mask_out_; + int64_t size_; + int64_t org_vocab_start_index_, org_vocab_end_index_; + int64_t added_vocab_start_index_, added_vocab_end_index_; + int64_t added_offset_; + + static constexpr float MIN_ACCURACY_FP32 = 1.1754943508222875e-38; + static constexpr float MAX_MUL_1_FP32 = 1125899906842624; + static constexpr float MAX_MUL_2_FP32 = 67108864; + static constexpr float NEGATIVE_ONE_FP32 = -1.0f; +}; + +extern "C" __global__ __aicore__ void get_masked_input_and_mask_kernel( + __gm__ int32_t* input, + __gm__ int32_t* masked_input, + __gm__ bool* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num) +{ + { + GetMaskedInputAndMask op{}; + + for (int64_t i = AscendC::GetBlockIdx(); i < loop_cnt; i += aiv_num) { + op.Init(input + i * size/loop_cnt, + masked_input + i * size/loop_cnt, + mask_out + i * size/loop_cnt, + org_vocab_start_index, org_vocab_end_index, + num_org_vocab_padding, added_vocab_start_index, + added_vocab_end_index, size/loop_cnt); + + op.Process(); + } + } // op destructor called here +} + +namespace vllm_ascend { + +void get_masked_input_and_mask_impl( + void* stream, + void* input, + void* masked_input, + void* mask_out, + const int64_t org_vocab_start_index, + 
const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num) +{ + get_masked_input_and_mask_kernel<<>>( + static_cast(input), + static_cast(masked_input), + static_cast(mask_out), + org_vocab_start_index, + org_vocab_end_index, + num_org_vocab_padding, + added_vocab_start_index, + added_vocab_end_index, + size, + loop_cnt, + aiv_num); +} + +} // namespace vllm_ascend + diff --git a/csrc/ops.h b/csrc/ops.h index b921b2bf99..b1bc6028eb 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -31,6 +31,20 @@ namespace vllm_ascend { const int headSize, const int64_t numTokens, const uint32_t loopCnt, uint32_t aivNum); + extern void get_masked_input_and_mask_impl( + void* stream, + void* input, + void* masked_input, + void* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num); + torch::Tensor weak_ref_tensor(torch::Tensor& tensor) { if (!tensor.is_privateuseone()) { throw std::runtime_error("Tensor must be on NPU device"); diff --git a/csrc/torch_binding.cpp b/csrc/torch_binding.cpp index c4154381da..001a9314d0 100644 --- a/csrc/torch_binding.cpp +++ b/csrc/torch_binding.cpp @@ -99,6 +99,112 @@ std::tuple rotary_embedding(at::Tensor &positions, at::T return {query_dst, key_dst}; } +std::tuple get_masked_input_and_mask( + at::Tensor &input, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index) + /* + https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/vocab_parallel_embedding.py#L161-L198 + Embedding parallelized in the vocabulary dimension. + + Adapted from torch.nn.Embedding, note that we pad the vocabulary size to + make sure it is divisible by the number of model parallel GPUs. + + In order to support various loading methods, we ensure that LoRA-added + embeddings are always at the end of TP-sharded tensors. In other words, + we shard base embeddings and LoRA embeddings separately (both padded), + and place them in the same tensor. + In this example, we will have the original vocab size = 1010, + added vocab size = 16 and padding to 64. Therefore, the total + vocab size with padding will be 1088 (because we first pad 1010 to + 1024, add 16, and then pad to 1088). + Therefore, the tensor format looks like the following: + TP1, rank 0 (no sharding): + |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >| + corresponding token_id: | 0 | 1 | ... | 1009 | -1 | ... | -1 | 1010 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 | + + TP2, rank 0: + |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >| + corresponding token_id: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 1000 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 520 | ... | 543 | + TP2, rank 1: + |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >| + corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... 
| -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 | + Parameters: + org_vocab_start_index //base embeddings start + org_vocab_end_index //base embeddings end + num_org_vocab_padding //base embeddings padding + added_vocab_start_index //LoRA embeddings start + added_vocab_end_index //LoRA embeddings end + */ +{ + // Input validation + TORCH_CHECK(input.dim() >= 1, "input must have at least 1 dimension"); + TORCH_CHECK(org_vocab_start_index >= 0, "org_vocab_start_index must be non-negative"); + TORCH_CHECK(org_vocab_end_index >= org_vocab_start_index, "org_vocab_end_index must be greater than org_vocab_start_index"); + TORCH_CHECK(num_org_vocab_padding >= 0, "num_org_vocab_padding must be non-negative"); + TORCH_CHECK(added_vocab_start_index >= org_vocab_end_index, "added_vocab_start_index must be greater than org_vocab_end_index"); + TORCH_CHECK(added_vocab_end_index >= added_vocab_start_index, "added_vocab_end_index must be greater than added_vocab_start_index"); + + // Get total number of elements + int64_t size = input.numel(); + + // Create output tensors + at::Tensor masked_input = at::empty_like(input); + at::Tensor mask = at::empty_like(input).to(at::kBool); + + // Get data pointers + void *input_ptr = input.data_ptr(); + void *masked_input_ptr = masked_input.data_ptr(); + void *mask_ptr = mask.data_ptr(); + + // Get current stream + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + + // Get scalar type + at::ScalarType scalar_type = input.scalar_type(); + + // Create and configure OpCommand + at_npu::native::OpCommand cmd; + cmd.Name("get_masked_input_and_mask"); + cmd.SetCustomHandler([scalar_type, size, stream, + input_ptr, masked_input_ptr, mask_ptr, + org_vocab_start_index, org_vocab_end_index, + num_org_vocab_padding, added_vocab_start_index, + added_vocab_end_index]() -> int { + // Get platform info + fe::PlatFormInfos platform_infos; + int device_id = 0; + fe::PlatformInfoManager::GeInstance().GetRuntimePlatformInfosByDevice(device_id, platform_infos); + uint32_t aivNum = platform_infos.GetCoreNumByType("aiv"); + uint32_t loop_cnt = (size + aivNum - 1) / aivNum; + + // Call implementation + get_masked_input_and_mask_impl( + stream, + input_ptr, + masked_input_ptr, + mask_ptr, + org_vocab_start_index, + org_vocab_end_index, + num_org_vocab_padding, + added_vocab_start_index, + added_vocab_end_index, + size, + loop_cnt, + aivNum); + + return 0; + }); + cmd.Run(); + return {masked_input, mask}; +} + void verify_tensor(std::string const& name, at::Tensor const& t, int64_t const size_0, int64_t const size_1, c10::ScalarType const type) { @@ -194,6 +300,16 @@ TORCH_LIBRARY_EXPAND(_C, ops) " Tensor! key, int head_size," " Tensor cos_sin_cache, bool is_neox) -> (Tensor query, Tensor key)"); ops.impl("rotary_embedding", torch::kPrivateUse1, &vllm_ascend::rotary_embedding); + + ops.def( + "get_masked_input_and_mask(Tensor input, " + " int org_vocab_start_index, " + " int org_vocab_end_index, " + " int num_org_vocab_padding, " + " int added_vocab_start_index, " + " int added_vocab_end_index) -> (Tensor masked_input, Tensor mask)"); + ops.impl("get_masked_input_and_mask", torch::kPrivateUse1, &vllm_ascend::get_masked_input_and_mask); + ops.def( "advance_step_flashattn_ascendc(int num_seqs, int num_queries, int block_size," " Tensor! input_tokens, Tensor! sampled_token_ids, Tensor! 
input_positions," diff --git a/tests/ops/test_vocabparallelembedding.py b/tests/ops/test_vocabparallelembedding.py new file mode 100644 index 0000000000..97d6c70596 --- /dev/null +++ b/tests/ops/test_vocabparallelembedding.py @@ -0,0 +1,91 @@ +from typing import Tuple + +import pytest +import torch +import torch_npu # noqa: F401 + +import vllm_ascend.platform # noqa: F401 + +# Test parameters +DTYPES = [torch.int32] +#SHAPES = [(100,), (5, 20), (3, 4, 5)] # Various tensor shapes +#SHAPES = [(3, 4, 8), (3, 4, 5)] # Various tensor shapes +SHAPES = [(3, 4, 3)] +DEVICES = [f"npu:{0}"] +SEEDS = [0] + + +def get_masked_input_and_mask_ref( + input_: torch.Tensor, org_vocab_start_index: int, + org_vocab_end_index: int, num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Reference implementation for verification""" + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < + org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index) + added_offset = added_vocab_start_index - ( + org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding + valid_offset = (org_vocab_start_index * + org_vocab_mask) + (added_offset * added_vocab_mask) + vocab_mask = org_vocab_mask | added_vocab_mask + masked_input = vocab_mask * (input_ - valid_offset) + return masked_input, ~vocab_mask + + +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEEDS) +@torch.inference_mode() +def test_get_masked_input_and_mask( + shape: Tuple[int, ...], + dtype: torch.dtype, + device: str, + seed: int, +) -> None: + # Set random seed + torch.manual_seed(seed) + torch.set_default_device(device) + + # Generate random input tensor + input_tensor = torch.randint(0, 1000, shape, dtype=dtype) + + # Test parameters + test_case = { + "org_start": 100, + "org_end": 200, + "padding": 0, + "added_start": 300, + "added_end": 400, + } + + # Get reference result + ref_masked_input, ref_mask = get_masked_input_and_mask_ref( + input_tensor, test_case["org_start"], test_case["org_end"], + test_case["padding"], test_case["added_start"], test_case["added_end"]) + + # Get custom op result + print("input_tensor:", input_tensor) + custom_masked_input, custom_mask = torch.ops._C.get_masked_input_and_mask( + input_tensor, test_case["org_start"], test_case["org_end"], + test_case["padding"], test_case["added_start"], test_case["added_end"]) + + ref_masked_input = ref_masked_input.to(dtype) + print("custom_masked_input:", custom_masked_input) + print("ref_masked_input:", ref_masked_input) + print("custom_mask:", custom_mask) + print("ref_mask:", ref_mask) + # Compare results + torch.testing.assert_close( + custom_masked_input, + ref_masked_input, + rtol=1e-5, + atol=1e-5, + msg=f"Masked input mismatch for case: {test_case}") + torch.testing.assert_close(custom_mask, + ref_mask, + rtol=1e-5, + atol=1e-5, + msg=f"Mask mismatch for case: {test_case}") From dd207cb26155f42479120de7bd451f3c06efaed9 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Thu, 12 Jun 2025 10:46:41 +0800 Subject: [PATCH 031/153] [CI][Benchmark] Add new model and v1 test to perf benchmarks (#1099) ### What this PR does / why we need it? 
- Add qwen2.5-7b-instruct test - Add v1 test --------- Signed-off-by: wangli --- .github/workflows/nightly_benchmarks.yaml | 28 ++++++++++++++--------- benchmarks/tests/latency-tests.json | 10 ++++++++ benchmarks/tests/serving-tests.json | 24 +++++++++++++++++++ benchmarks/tests/throughput-tests.json | 11 +++++++++ 4 files changed, 62 insertions(+), 11 deletions(-) diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 6ee1b45ed8..27ddb6571b 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -45,13 +45,18 @@ jobs: test: if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} - name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }} + name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}, use_v1=${{ matrix.vllm_use_v1 }} runs-on: 'linux-arm64-npu-static-8' strategy: matrix: include: - vllm_branch: v0.9.1 vllm_ascend_branch: main + vllm_use_v1: 0 + - vllm_branch: v0.9.0 + vllm_ascend_branch: main + vllm_use_v1: 1 + max-parallel: 1 container: image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 volumes: @@ -71,6 +76,7 @@ jobs: HF_TOKEN: ${{ secrets.HF_TOKEN }} ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }} ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }} + VLLM_USE_V1: ${{ matrix.vllm_use_v1 }} steps: - name: Check npu and CANN info run: | @@ -140,7 +146,7 @@ jobs: - name: Install elastic_tool if: github.event_name != 'pull_request' run: | - pip install escli-tool==0.2.1 + pip install escli-tool==0.2.2 - name: Collect pr info from vllm-project/vllm-ascend if: github.event_name != 'pull_request' @@ -177,17 +183,17 @@ jobs: echo "vllm branch: ${{ matrix.vllm_branch }}" echo "vllm-ascend branch: ${{ matrix.vllm_ascend_branch }}" echo "------------------------" + cd /github/home bash benchmarks/scripts/run-performance-benchmarks.sh # send the result to es - if [[ "${{ github.event_name }}" != "pull request" ]]; then - escli add --vllm_branch ${{ matrix.vllm_branch }} \ - --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \ - --commit_id $commit_id \ - --commit_title "$commit_title" \ - --created_at "$commit_time_no_tz" \ - --res_dir ./benchmarks/results - rm -rf ./benchmarks/results - fi + escli add --vllm_branch ${{ matrix.vllm_branch }} \ + --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \ + --commit_id $commit_id \ + --commit_title "$commit_title" \ + --created_at "$commit_time_no_tz" \ + --res_dir ./benchmarks/results \ + --extra_feat '{"VLLM_USE_V1": "${{ matrix.vllm_use_v1 }}"}' + rm -rf ./benchmarks/results cd - done < commit_log.txt diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json index 576ced213a..40cec4c5f6 100644 --- a/benchmarks/tests/latency-tests.json +++ b/benchmarks/tests/latency-tests.json @@ -9,5 +9,15 @@ "num_iters_warmup": 5, "num_iters": 15 } + }, + { + "test_name": "latency_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } } ] diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json index d8ad2be2b5..c8d5cdaf99 100644 --- a/benchmarks/tests/serving-tests.json +++ b/benchmarks/tests/serving-tests.json @@ 
-49,5 +49,29 @@ "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } + }, + { + "test_name": "serving_qwen2_5_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } } ] diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json index 551d23882e..3698e69f3f 100644 --- a/benchmarks/tests/throughput-tests.json +++ b/benchmarks/tests/throughput-tests.json @@ -22,6 +22,17 @@ "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", "num_prompts": 200 } + }, + { + "test_name": "throughput_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } } ] From 37f4469a033b58f958a9281847f66291ec453a36 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Thu, 12 Jun 2025 10:47:30 +0800 Subject: [PATCH 032/153] [CI][Benchmark] Add qwen2.5-7b test (#1104) ### What this PR does / why we need it? - Add qwen2.5-7b performance benchmark, this is a sub pr of #1099, for v1 test, need more verify - Fix get commit time after checkout --------- Signed-off-by: wangli --- .github/workflows/nightly_benchmarks.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 27ddb6571b..b060076c9c 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -170,10 +170,10 @@ jobs: while IFS= read -r line || [[ -n "$line" ]]; do commit_id=${line%% *} commit_title=${line#* } - commit_time=$(git show -s --format=%cd $commit_hash --date=iso-strict) - commit_time_no_tz=${commit_time::19} git checkout $commit_id + commit_time=$(git show -s --format=%cd $commit_hash --date=iso-strict) + commit_time_no_tz=${commit_time::19} pip install -e . echo "------------------------" From c6e2a5fb4014b863cee6abc3009f5bc5340c9e88 Mon Sep 17 00:00:00 2001 From: wangyanhui-cmss Date: Thu, 12 Jun 2025 19:40:58 +0800 Subject: [PATCH 033/153] [fix] fix bug in 1p1d disaggregated_prefill example (#1184) ### What this PR does / why we need it? fix bug in 1p1d disaggregated_prefill example ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? 
Tested with python find_device_ips.py and run disaggregated_prefill example Signed-off-by: wangyanhui-cmss --- .../disaggregated_prefill/find_device_ips.py | 58 ++++++++++--------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/examples/disaggregated_prefill/find_device_ips.py b/examples/disaggregated_prefill/find_device_ips.py index 205afbf789..48fd7b9d32 100644 --- a/examples/disaggregated_prefill/find_device_ips.py +++ b/examples/disaggregated_prefill/find_device_ips.py @@ -30,38 +30,40 @@ HCCN_TOOL_PATH = envs.HCCN_PATH -def get_device_ips(world_size: int): - npu_info = subprocess.run( - ["npu-smi", "info", "-m"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - ) +def get_device_ips(): + npu_info = subprocess.run(['npu-smi', 'info', '-m'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) if npu_info.returncode != 0 or not os.path.exists(HCCN_TOOL_PATH): raise RuntimeError("No npu-smi/hccn_tool tools provided for NPU.") - npu_start_idx = int( - re.match(r".*\n\t([0-9]+).*", - npu_info.stdout).group(1)) # type: ignore + + # ‌Extract NPU IDs for all Ascend devices (excluding Mcu rows) + device_ids = [] + for line in npu_info.stdout.strip().split('\n'): + match = re.match(r'^\s*(\d+)\s+\d+\s+\d+\s+Ascend', line) + if match: + device_ids.append(int(match.group(1))) + + if not device_ids: + raise RuntimeError( + "Cannot parse any valid device ID from npu-smi output.") + device_ip_list = [] - for ip_offset in range(world_size): - cmd = [ - HCCN_TOOL_PATH, - "-i", - f"{npu_start_idx + ip_offset}", - "-ip", - "-g", - ] - device_ip_info = subprocess.run( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - ) - device_ip = re.match(r"ipaddr:(.*)\n", - device_ip_info.stdout).group(1) # type: ignore + for device_id in device_ids: + cmd = [HCCN_TOOL_PATH, '-i', str(device_id), '-ip', '-g'] + device_ip_info = subprocess.run(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) + ip_match = re.search(r'ipaddr:(.*)', device_ip_info.stdout) + if not ip_match: + raise RuntimeError( + f"Cannot parse IP from hccn_tool for device {device_id}") + device_ip = ip_match.group(1).strip() device_ip_list.append(device_ip) + return device_ip_list -# Pass number of NPUs into this function. -print(get_device_ips(8)) +print(get_device_ips()) From 55c0e68883bea7241e0b4ed445ff4a57f79f692f Mon Sep 17 00:00:00 2001 From: Wan_Danfeng Date: Thu, 12 Jun 2025 21:22:23 +0800 Subject: [PATCH 034/153] [Doc] Add Referer header for CANN package download url. (#1192) ### What this PR does / why we need it? fix the CANN download url ### Does this PR introduce _any_ user-facing change? no, do not have any user-facing change ### How was this patch tested? run the **wget** command and cann package is rightly downloaded. --------- Signed-off-by: wan_danfeng --- docs/source/installation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/installation.md b/docs/source/installation.md index 8fbe1825dd..c9f6844915 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -78,17 +78,17 @@ source vllm-ascend-env/bin/activate pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple attrs 'numpy<2.0.0' decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions # Download and install the CANN package. 
-wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run
+wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run
 chmod +x ./Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run
 ./Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run --full
 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 
-wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run
+wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run
 chmod +x ./Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run
 ./Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run --install
 
-wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run
+wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run
 chmod +x ./Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run
 ./Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run --install

From e72f94e38f4208d36a20df1ff85ad4abf430447d Mon Sep 17 00:00:00 2001
From: sdmyzlp <117554856+sdmyzlp@users.noreply.github.com>
Date: Thu, 12 Jun 2025 21:42:09 +0800
Subject: [PATCH 035/153] Support multistream of MLA vector operations (#1135)

### What this PR does / why we need it?
Move all vector operations to a secondary stream, with the expected
overlapping being:

```
|   q_rmsnorm   |              | kv_norm_rope_cache |   q_rope   |
| matmul W_DQ | matmul W_DKV | index | index | matmul W_UQ | split | matmul W_KV_T |
```

Currently, the `IndexByTensor` operators introduced by the computation of
`cos` and `sin` can't be offloaded to the secondary stream due to a known
bug in the graph fusion optimization pass. So we instead keep it in the
main stream, only requiring it to be computed before `matmul W_UQ` to
avoid hindering later overlapping. The problem may be solved by a later
optimization (#993), which hoists the computation of `cos` and `sin` up
to the first layer.

### Does this PR introduce _any_ user-facing change?
Controlled by `torchair_graph_config.enable_multistream_mla`, which
defaults to `False`.

### How was this patch tested?
Tested on a 1x16 910 node, with a tailored 2-layer DSKv2.
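As a usage reference, here is a minimal sketch of how the new switch could be
turned on from the offline entry point. It assumes the `additional_config`
plumbing described in `docs/source/user_guide/additional_config.md`; the model
path and any other arguments below are placeholders, not part of this patch.

```python
from vllm import LLM

# Hypothetical sketch: enable_multistream_mla only takes effect together with
# torchair graph mode, so both flags are switched on here.
llm = LLM(
    model="/path/to/deepseek-v2",  # placeholder model path
    additional_config={
        "torchair_graph_config": {
            "enabled": True,
            "enable_multistream_mla": True,
        },
    },
)
```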
Signed-off-by: sdmyzlp --- docs/source/user_guide/additional_config.md | 1 + tests/singlecard/test_ascend_config.py | 2 + vllm_ascend/ascend_config.py | 2 + vllm_ascend/attention/mla_v1.py | 56 ++++++++++++++------- vllm_ascend/models/deepseek_v2.py | 14 +++++- 5 files changed, 56 insertions(+), 19 deletions(-) diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index 90002dbb91..4608326ecd 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -39,6 +39,7 @@ The details of each config option are as follows: | Name | Type | Default | Description | | ---- | ---- | ------- | ----------- | | `enabled` | bool | `False` | Whether to enable torchair graph mode | +| `enable_multistream_mla`| bool | `False` | Whether to put vector ops of MLA to another stream | | `enable_multistream_moe`| bool | `False` | Whether to enable multistream shared expert | | `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization | | `use_cached_graph` | bool | `False` | Whether to use cached graph | diff --git a/tests/singlecard/test_ascend_config.py b/tests/singlecard/test_ascend_config.py index 818745f308..63484d4a01 100644 --- a/tests/singlecard/test_ascend_config.py +++ b/tests/singlecard/test_ascend_config.py @@ -59,6 +59,7 @@ def test_run_with_ascend_config(): "graph_batch_sizes": [1, 2, 4, 8], "graph_batch_sizes_init": False, "enable_multistream_moe": True, + "enable_multistream_mla": True, }, "ascend_scheduler_config": { "enabled": True, @@ -79,6 +80,7 @@ def test_run_with_ascend_config(): 1, 2, 4, 8 ] assert not ascend_config.torchair_graph_config.graph_batch_sizes_init + assert ascend_config.torchair_graph_config.enable_multistream_mla assert ascend_config.torchair_graph_config.enable_multistream_moe assert ascend_config.ascend_scheduler_config.enabled assert ascend_config.ascend_scheduler_config.enable_chunked_prefill diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index f7441f5f82..2d3428327f 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -54,6 +54,8 @@ def __init__(self, torchair_graph_config): "graph_batch_sizes", []) self.graph_batch_sizes_init = torchair_graph_config.get( "graph_batch_sizes_init", False) + self.enable_multistream_mla = torchair_graph_config.get( + "enable_multistream_mla", False) self.enable_multistream_moe = torchair_graph_config.get( "enable_multistream_moe", False) self.enable_view_optimize = torchair_graph_config.get( diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 9e10815fbe..e07d59a309 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -19,6 +19,7 @@ from vllm_ascend.multistream.context import get_multistream_comm_context from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla +from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -481,6 +482,9 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz + self.enable_multistream_mla = \ + ascend_config.torchair_graph_config.enable_multistream_mla + # Adapt torch air graph mode with spec decoding. 
speculative_config = get_current_vllm_config().speculative_config if speculative_config is not None: @@ -664,17 +668,20 @@ def exec_kv( # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) cache_mode = "PA_NZ" if self.enable_kv_nz else "PA" - k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( - kv, - self.kv_a_layernorm.weight, - cos, - sin, - slots.to(torch.int64), - kv_cache[1], - kv_cache[0], - epsilon=self.kv_a_layernorm.variance_epsilon, - cache_mode=cache_mode, - ) + with npu_stream_switch("mla_secondary", + 0, + enabled=self.enable_multistream_mla): + k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( + kv, + self.kv_a_layernorm.weight, + cos, + sin, + slots.to(torch.int64), + kv_cache[1], + kv_cache[0], + epsilon=self.kv_a_layernorm.variance_epsilon, + cache_mode=cache_mode, + ) return k_pe, k_nope def exec_kv_prefill( @@ -867,23 +874,38 @@ def forward( if has_decode: decode_k_nope = None assert attn_metadata.decode is not None - decode_ql_nope, decode_q_pe = \ - self._q_proj_and_k_up_proj(decode_hs_or_q_c) if self.running_in_graph: seq_len = self.rotary_emb.max_position_embeddings cos = self.rotary_emb.cos_cached[:seq_len].to( - dtype=decode_q_pe.dtype) + dtype=decode_hs_or_q_c.dtype) sin = self.rotary_emb.sin_cached[:seq_len].to( - dtype=decode_q_pe.dtype) + dtype=decode_hs_or_q_c.dtype) cos = cos[attn_metadata.decode.input_positions] sin = sin[attn_metadata.decode.input_positions] cos = cos[:, None, None, :] sin = sin[:, None, None, :] - - decode_q_pe = self.rope_single(decode_q_pe, cos, sin) + # Without explicitly controlling the order, IndexByTensor operations + # would be placed after `matmul W_KV_T` hindering the overlapping of + # KvRmsNormRopeCache and SingleRope. + npu_wait_tensor(decode_hs_or_q_c, + cos, + enabled=self.enable_multistream_mla) + npu_wait_tensor(decode_hs_or_q_c, + sin, + enabled=self.enable_multistream_mla) + decode_ql_nope, decode_q_pe = \ + self._q_proj_and_k_up_proj(decode_hs_or_q_c) + if self.running_in_graph: decode_k_pe, decode_k_nope = self.exec_kv( hidden_states_or_kv_c_normed, cos, sin, kv_cache, attn_metadata.slot_mapping) + with npu_stream_switch("mla_secondary", + 0, + enabled=self.enable_multistream_mla): + npu_wait_tensor(decode_q_pe, + decode_k_pe, + enabled=self.enable_multistream_mla) + decode_q_pe = self.rope_single(decode_q_pe, cos, sin) else: decode_q_pe[...], decode_k_pe[...] 
= self.rotary_emb( attn_metadata.decode.input_positions, diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index a83ca47510..0ae1142a3d 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -71,7 +71,8 @@ from vllm_ascend.ops.fused_moe import AscendFusedMoE from vllm_ascend.quantization.quant_config import AscendLinearMethod from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod -from vllm_ascend.utils import dispose_tensor +from vllm_ascend.utils import (dispose_tensor, npu_stream_switch, + npu_wait_tensor) VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 @@ -496,6 +497,8 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_multistream_mla = \ + ascend_config.torchair_graph_config.enable_multistream_mla def forward( self, @@ -505,7 +508,14 @@ def forward( attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor: if self.q_lora_rank is not None: ckq = self.q_a_proj(hidden_states)[0] - hidden_states_or_q_c = self.q_a_layernorm(ckq) + use_multistream_mla = (self.enable_multistream_mla + and attn_metadata is not None + and attn_metadata.num_decodes > 0) + npu_wait_tensor(hidden_states, ckq, enabled=use_multistream_mla) + with npu_stream_switch("mla_secondary", + 0, + enabled=use_multistream_mla): + hidden_states_or_q_c = self.q_a_layernorm(ckq) else: hidden_states_or_q_c = hidden_states if self.torchair_graph_enabled: From 47b507b18096079c039c96018bb39eadc625d472 Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Fri, 13 Jun 2025 07:51:23 +0800 Subject: [PATCH 036/153] [CI] Recover ut for ascend scheduler only in ci of v1. (#1180) Last PR [#943 ](https://github.com/vllm-project/vllm-ascend/pull/943) wrongly open ut of AscendScheduler in V0 ci, this PR fixes this problem and only run ut of it in V1 ci. Signed-off-by: whx-sjtu <2952154980@qq.com> --- .github/workflows/vllm_ascend_test.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 30fbf0d5e3..f71e83205f 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -185,8 +185,8 @@ jobs: # test_ascend_config.py should be ran separately because it will regenerate the global config many times. pytest -sv tests/singlecard/test_ascend_config.py pytest -sv tests/singlecard/test_camem.py - # pytest -sv tests/singlecard/core/test_ascend_scheduler.py - # pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py + pytest -sv tests/singlecard/core/test_ascend_scheduler.py + pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ --ignore=tests/singlecard/test_guided_decoding.py \ @@ -217,7 +217,6 @@ jobs: # test_ascend_config.py should be ran separately because it will regenerate the global config many times. 
pytest -sv tests/singlecard/test_ascend_config.py pytest -sv tests/singlecard/test_prompt_embedding.py - pytest -sv tests/singlecard/core/test_ascend_scheduler.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ --ignore=tests/singlecard/test_guided_decoding.py \ From 94a52cf577f974e71c3e8c9315886cec70153960 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Fri, 13 Jun 2025 18:25:50 +0800 Subject: [PATCH 037/153] Add ShouJian Zheng (@jianzs) as vLLM Ascend maintainer (#1203) ### What this PR does / why we need it? Add @jianzs as vLLM Ascend maintainer @jianzs ---- I would like to nominate Shoujian Zheng (@jianzs ) as a maintainer, starting with my +1. - He focuses on the code quality and good design with solid reviews in P/D disaggregation and DeepSeek improvement area about 30+ high quality review, such as #issuecomment-2811764833, #discussion_r2069927605 and #pullrequestreview-2820996674. This is the most important reason why I nominated him, because helping community developers complete PRs with high quality and continuously ensure the quality of codebase is one of the important responsibilities of a maintainer. We believe he is a great addition. - Shoujian's main expertise is distributed inference. He has a lot of experience in production about AI infra. He has very good habits and explains in great detail all changes #issue-3023082580 anqd share results open: #issuecomment-2853140443. And High quality PR: #706, #774, #852. - Community Involvement: Active involved in community discussion, he is collaborative and helps the users solve problems, involved in 30+ PR and issue, such as #issuecomment-2911934292 and #issuecomment-2833523571. Reference: [1] https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html [2] https://vllm-ascend.readthedocs.io/en/latest/community/governance.html Signed-off-by: Yikun Jiang --- docs/source/community/contributors.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/community/contributors.md b/docs/source/community/contributors.md index d61259de84..16ea600d65 100644 --- a/docs/source/community/contributors.md +++ b/docs/source/community/contributors.md @@ -7,6 +7,7 @@ | Xiyuan Wang| [@wangxiyuan](https://github.com/wangxiyuan) | 2025/01 | | Yikun Jiang| [@Yikun](https://github.com/Yikun) | 2025/02 | | Yi Gan| [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/02 | +| Shoujian Zheng| [@jianzs](https://github.com/jianzs) | 2025/06 | ## Contributors From c28f6cba4d1732204452ee8d82802bb4bbf17155 Mon Sep 17 00:00:00 2001 From: NPUDev Date: Tue, 10 Jun 2025 20:36:06 +0800 Subject: [PATCH 038/153] add eplb policy --- vllm_ascend/eplb/core/policy/__init__.py | 0 vllm_ascend/eplb/core/policy/eplb_policy.py | 42 +++++++++++++++++++ .../eplb/core/policy/mock_load_balance.py | 29 +++++++++++++ .../eplb/core/policy/policy_factory.py | 15 +++++++ 4 files changed, 86 insertions(+) create mode 100644 vllm_ascend/eplb/core/policy/__init__.py create mode 100644 vllm_ascend/eplb/core/policy/eplb_policy.py create mode 100644 vllm_ascend/eplb/core/policy/mock_load_balance.py create mode 100644 vllm_ascend/eplb/core/policy/policy_factory.py diff --git a/vllm_ascend/eplb/core/policy/__init__.py b/vllm_ascend/eplb/core/policy/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vllm_ascend/eplb/core/policy/eplb_policy.py b/vllm_ascend/eplb/core/policy/eplb_policy.py new file mode 100644 index 0000000000..1de60c348d --- /dev/null +++ b/vllm_ascend/eplb/core/policy/eplb_policy.py @@ -0,0 +1,42 
@@
+# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+from abc import abstractmethod
+
+
+class DynamicConfig:
+    placement_policy = None
+
+    max_transferred_expert_per_layer = 100
+    # Maximum number of experts that may be transferred per layer on one host
+
+    ep_worldsize = 64  # Number of dies across which all experts in the cluster are distributed
+    num_die_per_host = 8  # Number of dies on each host
+
+
+class EplbPolicy:
+    def __init__(self, config: DynamicConfig):
+        self.config = config
+
+    @abstractmethod
+    def rebalance_experts(self, current_expert_table, expert_workload):
+        """
+        Take the expert workload and return expert replication and placement under the given constraints.
+        INPUT:
+        current_expert_table: [layerId, rankId, expert_num_i]
+        expert_workload = expert_table[layer0][rankId][expert_num_i]
+
+        RETURNED: (res, expert_table)
+        res:
+        1 -- table_changed
+        0 -- not_changed
+
+        expert_table: [layerId, rankId, expert_num_i]
+        expert_num_i --- [0, MaxExpertPerRank]
+        expertID = expert_table[layer0][rankId][expert_num_i]
+        array_values:
+        [0, 1, 2, 3, 248]
+        [4, 5, 6, 7, 254]
+        [8, 9, 10, 11, 71]
+        ...
+        [252, 253, 254, 255, 0]
+        """
+        pass
diff --git a/vllm_ascend/eplb/core/policy/mock_load_balance.py b/vllm_ascend/eplb/core/policy/mock_load_balance.py
new file mode 100644
index 0000000000..0ba2ca21ba
--- /dev/null
+++ b/vllm_ascend/eplb/core/policy/mock_load_balance.py
@@ -0,0 +1,29 @@
+# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+import copy
+import random
+import torch
+import torch
+
+from .eplb_policy import EplbPolicy, DynamicConfig
+
+random.seed(42)
+
+class MockLoadBalance(EplbPolicy):
+    def __init__(self, config: DynamicConfig):
+        super().__init__(config)
+
+    def rebalance_experts(self, current_expert_table, expert_workload):
+        new_table = copy.deepcopy(current_expert_table)
+        num_layers = len(current_expert_table)
+        num_card = len(current_expert_table[0])
+
+        for i in range(num_layers):
+            # Randomly select two cards
+            indices = random.sample(range(num_card), 2)
+
+            # Swap their redundant experts
+            expert_id_to_exchange = new_table[i][indices[0]][-1].clone()
+            new_table[i][indices[0]][-1] = new_table[i][indices[1]][-1]
+            new_table[i][indices[1]][-1] = expert_id_to_exchange
+
+        return 1, [-i for i in range(num_layers)], new_table
\ No newline at end of file
diff --git a/vllm_ascend/eplb/core/policy/policy_factory.py b/vllm_ascend/eplb/core/policy/policy_factory.py
new file mode 100644
index 0000000000..d06d1ce634
--- /dev/null
+++ b/vllm_ascend/eplb/core/policy/policy_factory.py
@@ -0,0 +1,15 @@
+# Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+from .eplb_policy import EplbPolicy, DynamicConfig
+from .mock_load_balance import MockLoadBalance
+from .mock_dynamic_ep import DynamicEP
+
+
+
+class PolicyFactory:
+    @staticmethod
+    def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy:
+        policy = {
+            0: MockLoadBalance,  # MockLoadBalance
+            1: DynamicEP,        # DynamicEP
+        }
+        return policy.get(policy_type, MockLoadBalance)(config)
From 839cab1ab67bc02e46c77b372d9338f12b162e9e Mon Sep 17 00:00:00 2001
From: NPUDev
Date: Tue, 10 Jun 2025 20:36:15 +0800
Subject: [PATCH 039/153] add eplb updator

---
 vllm_ascend/eplb/eplb_updator.py | 148 +++++++++++++++++++++
 1 file changed, 148 insertions(+)
 create mode 100644 vllm_ascend/eplb/eplb_updator.py

diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py
new file mode 100644
index 0000000000..99cd42255b
--- /dev/null
+++ b/vllm_ascend/eplb/eplb_updator.py
@@ -0,0 +1,148 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +import torch +import torch.distributed as dist +from multiprocessing import Queue, Manager + +from vllm.logger import logger +from vllm_ascend.eplb.core.worker.eplb_worker import EplbProcess +from vllm_ascend.eplb.core.loader.device_transfer_loader import D2DExpertWeightLoader + +class EplbUpdator: + + def __init__(self): + self.init_eplb() + + def set_adaptor(self, adaptor): + self.adaptor = adaptor + self.eplb_loader = D2DExpertWeightLoader(eplb_adaptor=self.adaptor) + + def init_eplb(self): + + self.num_iterations: torch.int64 = 10 + + self.num_moe_layers = 2 + self.weight_update_counter = 0 + self.expert_map_initialized = False + self.update_in_flight = False + + self.reqs = [] + + self.cur_iterations: torch.int64 = 0 + + self.planner_block_queue = Queue() + self.block_update_queue = Queue(maxsize=1) + + self.manager = Manager() + self.shared_dict = self.manager.dict({ + # 当前rank_id的专家表[num_layers,num_experts] + "expert_map": None, + # 热度负载信息 [num_layers, world_size, num_experts] + "moe_load": None, + # 所有的专家表[num_layers, world_size, num_experts] + "expert_maps": None + }) + + self.eplb = EplbProcess( + shared_dict = self.shared_dict, + planner_q = self.planner_block_queue, + block_update_q = self.block_update_queue, + policy_type = 0, + enable_d2d = True + ) + + self.eplb_process = self.eplb._launch_process() + + logger.info(f"[ModelRunner] Launched EPLB process (pid={self.eplb_process.pid})") + + + def get_update_iteration(self): + self.cur_iterations = self.cur_iterations + 1 + return self.cur_iterations % self.num_iterations == 0 + + def get_init_expert_map(self): + try: + if not self.expert_map_initialized: + self.shared_dict["expert_maps"] = self.adaptor.get_init_expert_map(self.num_moe_layers) + self.expert_map_initialized = True + except Exception as e: + logger.warning(f"[ModelRunner] Failed to wake EPLB process: {e}", exc_info=True) + + def wakeup_eplb_worker(self): + self.planner_block_queue.put(1) + + def forward_before(self): + if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: + (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.block_update_queue.get() + rank_id = torch.distributed.get_rank() + expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] + expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] + # TODO: layer_id + 3 should be replaced by configuration + self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info_this_rank, + expert_recv_info_this_rank, updated_expert_map[rank_id], layer_id + 3) + self.weight_update_counter += 1 + if self.weight_update_counter == self.num_moe_layers: + self.update_in_flight = False + + # set asynchronous stream for d2d expert weight update + self.reqs = [] + self.eplb_loader.asyn_expert_weight_transfer(self.reqs) + + def forward_end(self): + + self.get_init_expert_map() + + if not 
self.update_in_flight and self.get_update_iteration(): + moe_load = self.compute_and_set_moe_load() + self.wakeup_eplb_worker() + self.update_in_flight = True + + self.eplb_loader.update_expert_map_and_weight(self.reqs) + + def compute_and_set_moe_load(self): + local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers) + + self._gather_buffer = None + if dist.is_initialized(): + self.world_size = dist.get_world_size() + if dist.is_initialized(): + device = local_load.device + if self._gather_buffer is None: + shape = (self.world_size, *local_load.shape) + self._gather_buffer = torch.empty(shape, + dtype=local_load.dtype, + device=device) + + dist.all_gather_into_tensor(self._gather_buffer, local_load) + + moe_load = self._gather_buffer.permute(1, 0, 2).contiguous() + self.shared_dict["moe_load"] = moe_load.cpu() + logger.debug(f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}") + else: + moe_load = local_load.unsqueeze(1) + self.shared_dict["moe_load"] = moe_load.cpu() + logger.debug(f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}") + return moe_load + + def shutdown(self): + """ + Clean up the EPLB process. + """ + if self.eplb_process.is_alive(): + self.eplb_process.terminate() + self.eplb_process.join() + logger.info("[ModelRunner] EPLB process terminated") From 6ad801d59036d235412db70f2313a01562c89572 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Tue, 10 Jun 2025 20:37:15 +0800 Subject: [PATCH 040/153] implementation of VllmEplbAdaptor and D2DExpertWeightLoader --- vllm_ascend/eplb/adaptor/abstract_adaptor.py | 39 +++++ vllm_ascend/eplb/adaptor/vllm_adaptor.py | 90 +++++++++++ .../eplb/core/loader/abstract_loader.py | 24 +++ .../core/loader/device_transfer_loader.py | 141 ++++++++++++++++++ 4 files changed, 294 insertions(+) create mode 100644 vllm_ascend/eplb/adaptor/abstract_adaptor.py create mode 100644 vllm_ascend/eplb/adaptor/vllm_adaptor.py create mode 100644 vllm_ascend/eplb/core/loader/abstract_loader.py create mode 100644 vllm_ascend/eplb/core/loader/device_transfer_loader.py diff --git a/vllm_ascend/eplb/adaptor/abstract_adaptor.py b/vllm_ascend/eplb/adaptor/abstract_adaptor.py new file mode 100644 index 0000000000..8513b69ea0 --- /dev/null +++ b/vllm_ascend/eplb/adaptor/abstract_adaptor.py @@ -0,0 +1,39 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# + +from abc import ABC, abstractmethod + +class EplbAdaptor(): + + def __init__(self, **args): + pass + + @abstractmethod + def get_rank_expert_workload(self, num_moe_layers): + raise NotImplementedError + + @abstractmethod + def get_init_expert_map(self): + raise NotImplementedError + + @abstractmethod + def do_update_expert_map(self): + raise NotImplementedError + + @abstractmethod + def do_update_expert_weight(self): + raise NotImplementedError \ No newline at end of file diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py new file mode 100644 index 0000000000..f4241a01fa --- /dev/null +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -0,0 +1,90 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import torch +import torch.distributed as dist + +from vllm_ascend.eplb.adaptor.abstract_adaptor import EplbAdaptor +from vllm.logger import logger + +class VllmEplbAdaptor(EplbAdaptor): + + def __init__(self, model, **args): + super().__init__(**args) + self.model = model + self.param_dict = dict(self.model.named_parameters()) + + # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 is supported here + self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", + "w2_weight_scale", "w2_weight_offset"] + + self.buffer_tensor_dict = dict() + num_buffer_tensor = 100 # TO DO: provide number of buffer tensor by vllm configuration + params_dtype = torch.int8 # TO DO: provide number of buffer tensor by vllm configuration + self.init_buffer_tensor_dict(num_buffer_tensor, params_dtype) + + self.expert_map_per_layer = dict() + num_moe_layers = 2 # TO DO: provide number of num_moe_layers by vllm configuration + for layer_idx in range(num_moe_layers): + self.expert_map_per_layer[3 + layer_idx] = self.model.get_expert_map(3 + layer_idx) + + def init_buffer_tensor_dict(self, num_buffer_tensor, params_dtype): + for name in self.expert_weight_names: + complete_name = "model.layers.3.mlp.experts." + name + expert_tensor = self.param_dict[complete_name].data[0:num_buffer_tensor] + self.buffer_tensor_dict[name] = torch.empty_like(expert_tensor) + + def get_buffer_tensor(self, buffer_tensor_id): + for name in self.expert_weight_names: + yield self.buffer_tensor_dict[name][buffer_tensor_id] + + def get_expert_tensor(self, layer_id, global_expert_id_to_send): + for name in self.expert_weight_names: + complete_name = "model.layers." + str(layer_id) + ".mlp.experts." 
+ name + local_expert_id = self.expert_map_per_layer[layer_id][global_expert_id_to_send].item() + yield self.param_dict[complete_name].data[local_expert_id] + + def get_rank_expert_workload(self, num_moe_layers): + return self.model.get_all_moe_loads(num_moe_layers) + + def get_init_expert_map(self, num_moe_layers): + expert_map = self.model.get_all_expert_map(num_moe_layers) + if dist.is_initialized(): + world_size = dist.get_world_size() + + rank = dist.get_rank() + + tensor_list = [ + torch.zeros_like(expert_map) for _ in range(world_size) + ] + + dist.all_gather(tensor_list, expert_map) + gathered = torch.stack(tensor_list, dim=0) + all_maps = gathered.permute(1, 0, 2).contiguous() + + all_expert_maps = all_maps.to(torch.device("cpu")) + return all_expert_maps + + def do_update_expert_map(self, layer_id, updated_expert_map): + self.expert_map_per_layer[layer_id].copy_(updated_expert_map) + + def do_update_expert_weight(self, layer_id, expert_id_before_replace, buffer_tensor_id): + for name in self.expert_weight_names: + complete_name = "model.layers." + str(layer_id) + ".mlp.experts." + name + local_expert_id = self.expert_map_per_layer[layer_id][expert_id_before_replace].item() + expert_tensor = self.param_dict[complete_name].data[local_expert_id] + expert_tensor.copy_(self.buffer_tensor_dict[name][buffer_tensor_id]) \ No newline at end of file diff --git a/vllm_ascend/eplb/core/loader/abstract_loader.py b/vllm_ascend/eplb/core/loader/abstract_loader.py new file mode 100644 index 0000000000..b1bef11c5d --- /dev/null +++ b/vllm_ascend/eplb/core/loader/abstract_loader.py @@ -0,0 +1,24 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +from abc import ABC, abstractmethod + +class ExpertWeightLoader: + + @abstractmethod + def load_impl(self, old_expert_table, new_expert_table): + raise NotImplementedError \ No newline at end of file diff --git a/vllm_ascend/eplb/core/loader/device_transfer_loader.py b/vllm_ascend/eplb/core/loader/device_transfer_loader.py new file mode 100644 index 0000000000..ea48798670 --- /dev/null +++ b/vllm_ascend/eplb/core/loader/device_transfer_loader.py @@ -0,0 +1,141 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# +import torch +import torch.distributed as dist +from enum import Enum + +from vllm.logger import logger +from vllm_ascend.eplb.core.loader.abstract_loader import ExpertWeightLoader + +class ExpertWeightUpdateState(Enum): + WAITING = 0 # waiting for updated expert_map by EplbWorker + READY = 1 # ready for d2d expert weights updating + TRANSFERING = 2 # d2d finished and waiting for updating expert_map into model + +class D2DExpertWeightLoader(ExpertWeightLoader): + + def __init__(self, eplb_adaptor): + self.comm_op_list = None + self.eplb_adaptor = eplb_adaptor + + self.updated_expert_map = None + self.layer_id = -1 # layer id to be updated + self.state = ExpertWeightUpdateState.WAITING + self.recv_expert_list = [] + self.mock_flag = True + + def generate_expert_d2d_transfer_task(self, expert_send_info, expert_recv_info, + updated_expert_map, layer_id): + # When current send/recv and weight.expert_map update tasks are not finished, cannot accept new d2d task + if self.state != ExpertWeightUpdateState.WAITING: + logger.error("current d2d weight update tasks are on-going, cannot accept new weight update task") + return + + # If neither send nor receive task is needed for this layer on this rank, return + if not (expert_send_info or expert_recv_info): + return + + self.updated_expert_map = updated_expert_map + + self.layer_id = layer_id + self.comm_op_list = [] + for send_info in expert_send_info: + dst_rank, global_expert_id_to_send = send_info + for src_tensor in self.eplb_adaptor.get_expert_tensor(layer_id, global_expert_id_to_send): + self.comm_op_list.append(dist.P2POp(dist.isend, src_tensor, dst_rank)) + + buffer_tensor_id = 0 + for recv_info in expert_recv_info: + recv_rank, global_expert_id_to_recv = recv_info + for buffer_tensor in self.eplb_adaptor.get_buffer_tensor(buffer_tensor_id): + self.comm_op_list.append(dist.P2POp(dist.irecv, buffer_tensor, recv_rank)) + self.recv_expert_list.append((global_expert_id_to_recv, buffer_tensor_id)) + buffer_tensor_id += 1 + + self.state = ExpertWeightUpdateState.READY + + def asyn_expert_weight_transfer(self, reqs): + # Only when send/recv tasks are parsed into self.comm_op_list, d2d send/recv tasks can be luanched + if self.state != ExpertWeightUpdateState.READY: + return + + # set asynchronous stream for d2d expert weight transfer + if self.comm_op_list: + reqs = dist.batch_isend_irecv(self.comm_op_list) + + self.state = ExpertWeightUpdateState.TRANSFERING + + def update_expert_map_and_weight(self, reqs): + # Only after send/recv tasks have been luanched, expert_map and weight can be updated + if self.state != ExpertWeightUpdateState.TRANSFERING: + return + + # Waiting for send/recv tasks finish + for req in reqs: + req.wait() + + if self.comm_op_list is not None: + self.comm_op_list = None + + # update expert_map + self.eplb_adaptor.do_update_expert_map(self.layer_id, self.updated_expert_map) + + # update expert weight + buffer_tensor_id = 0 + for recv_expert_info in self.recv_expert_list: + global_expert_id_to_recv, buffer_tensor_id = recv_expert_info + self.eplb_adaptor.do_update_expert_weight(self.layer_id, global_expert_id_to_recv, buffer_tensor_id) + + self.recv_expert_list = [] + self.updated_expert_map = None + self.layer_id = -1 + self.state = ExpertWeightUpdateState.WAITING + + def generate_mock_update_info(self, rank_id): + if rank_id == 0: + expert_send_info = [(1, 0)] + expert_recv_info = [(1, 64)] + updated_expert_map_list = [-1] + [i for i in range(1, 64)] + [0] + [j for j in [-1] * 191] + updated_expert_map = 
torch.tensor(updated_expert_map_list) + layer_id = 3 + + if rank_id == 1: + expert_send_info = [(0, 64)] + expert_recv_info = [(0, 0)] + updated_expert_map_list = [0] + [k for k in [-1] * 63] + [i for i in range(1, 64)] + [j for j in [-1] * 129] + updated_expert_map = torch.tensor(updated_expert_map_list) + layer_id = 3 + + if rank_id == 2: + expert_send_info = [(3, 128)] + expert_recv_info = [(3, 192)] + updated_expert_map_list = [k for k in [-1] * 129] + [i for i in range(1, 64)] + [0] + [j for j in [-1] * 63] + updated_expert_map = torch.tensor(updated_expert_map_list) + layer_id = 3 + + if rank_id == 3: + expert_send_info = [(2, 192)] + expert_recv_info = [(2, 128)] + updated_expert_map_list = [k for k in [-1] * 128] + [0] + [k for k in [-1] * 64] + [i for i in range(1, 64)] + updated_expert_map = torch.tensor(updated_expert_map_list) + layer_id = 3 + + self.mock_flag = False + return (expert_send_info, expert_recv_info, updated_expert_map, layer_id) + + def load_impl(self, old_expert_table, new_expert_table): + raise NotImplementedError \ No newline at end of file From 34109ac232141df4116b1fcd052f05699872fa53 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Tue, 10 Jun 2025 21:12:11 +0800 Subject: [PATCH 041/153] determine num_dense_layers and num_moe_layers by refering to model configuration --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index f4241a01fa..531ff205eb 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -27,6 +27,8 @@ def __init__(self, model, **args): super().__init__(**args) self.model = model self.param_dict = dict(self.model.named_parameters()) + self.num_dense_layers = self.model.config.first_k_dense_replace + self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 is supported here self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", @@ -34,17 +36,16 @@ def __init__(self, model, **args): self.buffer_tensor_dict = dict() num_buffer_tensor = 100 # TO DO: provide number of buffer tensor by vllm configuration - params_dtype = torch.int8 # TO DO: provide number of buffer tensor by vllm configuration - self.init_buffer_tensor_dict(num_buffer_tensor, params_dtype) + self.init_buffer_tensor_dict(num_buffer_tensor) self.expert_map_per_layer = dict() - num_moe_layers = 2 # TO DO: provide number of num_moe_layers by vllm configuration - for layer_idx in range(num_moe_layers): - self.expert_map_per_layer[3 + layer_idx] = self.model.get_expert_map(3 + layer_idx) + for layer_idx in range(self.num_moe_layers): + self.expert_map_per_layer[self.num_dense_layers + layer_idx] =\ + self.model.get_expert_map(self.num_dense_layers + layer_idx) - def init_buffer_tensor_dict(self, num_buffer_tensor, params_dtype): + def init_buffer_tensor_dict(self, num_buffer_tensor): for name in self.expert_weight_names: - complete_name = "model.layers.3.mlp.experts." + name + complete_name = "model.layers." + str(self.num_dense_layers) + ".mlp.experts." 
+ name expert_tensor = self.param_dict[complete_name].data[0:num_buffer_tensor] self.buffer_tensor_dict[name] = torch.empty_like(expert_tensor) From c15f8a85a5aa2a96e6b0c6027cd9dc58d7268229 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 10 Jun 2025 21:25:42 +0800 Subject: [PATCH 042/153] EPLB add eplb_worker --- vllm_ascend/eplb/core/worker/eplb_worker.py | 261 ++++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 vllm_ascend/eplb/core/worker/eplb_worker.py diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py new file mode 100644 index 0000000000..581a509c96 --- /dev/null +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -0,0 +1,261 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import time +import numpy as np +import torch +import torch_npu +import logging +import torch.distributed as dist +from multiprocessing import Process, Queue, Manager +from abc import ABC, abstractmethod +from vllm.logger import logger + +from vllm_ascend.eplb.core.policy.policy_factory import PolicyFactory, DynamicConfig + + +class EplbWorker: + + def __init__(self, shared_dict, policy_type, enable_d2d: bool = True): + self.policy_type = policy_type + self.policy = PolicyFactory.generate_policy(policy_type, DynamicConfig()) + self.shared_dict = shared_dict + self.old_expert_maps = None + self.enable_d2d = enable_d2d + + def do_update(self): + # put data in to queue + # in process self.policy.generate_policy() + # get epxert table && tensor + + # async stream + # D2D + # H2D + + # Get initial expert_map + if self.old_expert_maps is None: + self.old_expert_maps = self.get_init_expert_maps() + self.num_local_experts = self.old_expert_maps.max() + 1 + + # Get MOE load information + load_info = self.fetch_and_sum_load_info() + if load_info is None: + return + + #根据负载信息,获取更新后的专家表 + load_info, old_placemet = self.global2local(load_info, self.old_expert_maps, self.num_local_experts) + changed, priority, new_placement = self.calculate_rebalance_experts(load_info, old_placemet) + + new_expert_maps = self.local2global(new_placement) + + logger.debug(f"[EPLB Process new_map differs, performing D2D") + + update_info = self.compose_expert_update_info(new_expert_maps, self.old_expert_maps) + self.old_expert_maps = new_expert_maps + logger.info("EPLB Process complete") + + return update_info + + # TODO: Here only expert weight exchange is considered, need to be extended to cover other weight update cases + def compose_expert_update_info(self, updated_expert_maps, current_expert_maps): + num_layers = current_expert_maps.shape[0] + num_ranks = current_expert_maps.shape[1] + num_experts = current_expert_maps.shape[2] + + for layer_id in range(num_layers): + updated_expert_maps_this_layer = updated_expert_maps[layer_id] + current_expert_maps_this_layer = current_expert_maps[layer_id] + + expert_send_info_this_layer = dict() + 
expert_recv_info_this_layer = dict() + + # Guard Clause: if there is no expert weight update, avoid subsequent processing + if torch.equal(updated_expert_maps_this_layer, current_expert_maps_this_layer): + yield (expert_send_info_this_layer, expert_recv_info_this_layer, updated_expert_maps_this_layer, layer_id) + + # Parse expert_ids each rank needs to receive from other ranks + dst_rank_indices, experts_to_recv = torch.where((current_expert_maps_this_layer == -1) \ + & (updated_expert_maps_this_layer != -1)) + + # Parse expert_ids each rank needs to send to other ranks + src_rank_indices, experts_to_send = torch.where((current_expert_maps_this_layer != -1) \ + & (updated_expert_maps_this_layer == -1)) + + for idx in range(len(dst_rank_indices)): + dst_rank_id = dst_rank_indices[idx].item() + expert_id = experts_to_recv[idx].item() + if dst_rank_id not in expert_recv_info_this_layer: + expert_recv_info_this_layer[dst_rank_id] = [] + + if not torch.isin(src_rank_indices, torch.tensor(expert_id)).any(): + # if expert_id are not sent out from any npu, it will be copied from one npu holding this expert + candidate_src_rank_indices = torch.where(current_expert_maps_this_layer[:, expert_id] != -1) + else: + candidate_src_rank_indices = src_rank_indices[experts_to_send == expert_id] + + #TODO: improve selection criterion of npu sending expert_id considering such as intra-node or inter-node... + src_rank_id = candidate_src_rank_indices[0].item() + if src_rank_id not in expert_send_info_this_layer: + expert_send_info_this_layer[src_rank_id] = [] + + expert_send_info_this_layer[src_rank_id].append((dst_rank_id, expert_id)) + expert_recv_info_this_layer[dst_rank_id].append((src_rank_id, expert_id)) + + yield (expert_send_info_this_layer, expert_recv_info_this_layer, updated_expert_maps_this_layer, layer_id) + + + def calculate_rebalance_experts(self, load_info, old_placement): + """ + 通过 policy 实例的 rebalance_experts 方法计算 new_map。 + """ + if self.old_expert_maps is None: + return False, None, None + + changed, priority, new_map = self.policy.rebalance_experts(old_placement, load_info) + return changed, priority, new_map + + def get_init_expert_maps(self): + """ + Read the initial expert_map from shared_dict. + """ + return self.shared_dict.get("expert_maps", None) + + def fetch_and_sum_load_info(self): + """ + Each time the subprocess is awakened, read the latest moe_load + (shape: [num_moe_layers, num_experts_per_layer]) from shared_dict. 
+ """ + return self.shared_dict.get("moe_load", None) + + def update_expert_map(self, expert_maps): + + self.shared_dict["expert_maps"] = expert_maps + + def global2local(self, + workload: torch.Tensor, + placement: torch.Tensor, + E_local: int + ) -> tuple[torch.Tensor, torch.Tensor]: + + L, G, _ = placement.shape + device = placement.device + + wt_local = torch.full((L, G, E_local), + fill_value=-1, + dtype=workload.dtype, + device=device) + pt_local = torch.full((L, G, E_local), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement >= 0 + l_idx, g_idx, k_idx = valid.nonzero(as_tuple=True) + + slot_idx = placement[l_idx, g_idx, k_idx] + values = workload[l_idx, g_idx, k_idx] + + wt_local[l_idx, g_idx, slot_idx] = values + pt_local[l_idx, g_idx, slot_idx] = k_idx + + return wt_local, pt_local + + + def local2global(self, + placement_local: torch.Tensor + ) -> torch.Tensor: + + L, G, E_local = placement_local.shape + device = placement_local.device + + max_id = torch.max(placement_local) + E_global = (max_id + 1).item() if max_id >= 0 else 0 + + if E_global == 0: + return torch.empty((L, G, 0), dtype=torch.long, device=device) + + placement_global = torch.full((L, G, E_global), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement_local >= 0 + l_idx, g_idx, slot_idx = valid.nonzero(as_tuple=True) + gid_idx = placement_local[l_idx, g_idx, slot_idx] + + placement_global[l_idx, g_idx, gid_idx] = slot_idx + + return placement_global + + +class EplbProcess: + def __init__(self, shared_dict, planner_q, block_update_q, policy_type: int = 0, enable_d2d: bool = True): + """ + Args: + shared_dict: Cross-process shared dict returned by Manager().dict() + policy_type: Integer passed to PolicyFactory.generate_policy + enable_d2d: Whether to enable D2D loading + """ + self.shared_dict = shared_dict + self.policy_type = policy_type + self.enable_d2d = enable_d2d + self.planner_q = planner_q + self.block_update_q = block_update_q + + # Create EplbWorker instance + self.worker = EplbWorker(self.shared_dict, self.policy_type, self.enable_d2d) + + + def worker_process(self,planner_q,block_update_q): + """ + Subprocess entry: bind to specified NPU, loop waiting for planner_q to wake up, call do_update, then notify main process update is complete. + """ + while True: + try: + + planner_q.get() + + update_info = self.worker.do_update() + + print("update_info:", update_info) + + for (a,b,c,d) in update_info: + while True: + if not block_update_q.empty(): + continue + block_update_q.put((a,b,c,d)) + break + + print("EPLB Process complete") + + except Exception as e: + logger.warning(f"[EPLB subprocess Exiting due to error: {e}", exc_info=True) + break + + def _launch_process(self): + """ + Use spawn method to launch subprocess and return (planner_q, block_update_q, proc). 
+ """ + proc = Process( + target=self.worker_process, + args=(self.planner_q,self.block_update_q), + daemon=True + ) + + proc.start() + return proc + From 1400ea3e0df9525a5d2003075c59a5860c48a0ae Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 10 Jun 2025 21:33:01 +0800 Subject: [PATCH 043/153] add ssd loader --- vllm_ascend/eplb/core/loader/ssd_loader.py | 295 +++++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 vllm_ascend/eplb/core/loader/ssd_loader.py diff --git a/vllm_ascend/eplb/core/loader/ssd_loader.py b/vllm_ascend/eplb/core/loader/ssd_loader.py new file mode 100644 index 0000000000..2c00c13a85 --- /dev/null +++ b/vllm_ascend/eplb/core/loader/ssd_loader.py @@ -0,0 +1,295 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import os +import json +import re +import time +import queue +import math +import multiprocessing as mp +from dataclasses import dataclass +from typing import Dict, List, Tuple, Optional, Generator, Any + +import torch +import torch_npu +from safetensors import safe_open +from vllm_ascend.eplb.core.loader.abstract_loader import ExpertWeightLoader + + +def log_debug(msg: str): + print(f"[DEBUG] {msg}") + + +class SSDExpertWeightLoader: + + """ + Load all tensors that belong to one (layer, expert_id) pair, CPU only. + """ + + def __init__(self, load_config: Optional[dict] = None): + self.load_config = load_config or {} + self.counter_before_loading_weights = 0.0 + + + def _prepare_index( + self, + model_dir: str, + index_file: str + ) -> Dict[str, str]: + index_path = os.path.join(model_dir, index_file) + if not os.path.exists(index_path): + raise FileNotFoundError(index_path) + with open(index_path, "r", encoding="utf-8") as f: + data = json.load(f) + if "weight_map" not in data: + raise KeyError("weight_map missing in index file") + return data["weight_map"] + + + def load_expert_weight_from_ssd( + self, + model_dir: str, + layer_prefix: str, + expert_id: int, + index_file: str = "model.safetensors.index.json" + ) -> Dict[str, torch.Tensor]: + index_map = self._prepare_index(model_dir, index_file) + + # collect parameter names under the target expert + prefix = f"{layer_prefix}.experts.{expert_id}." 
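+        # e.g. layer_prefix "model.layers.3.mlp" and expert_id 7 give the prefix "model.layers.3.mlp.experts.7."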
+ param_to_shard = { + name: shard for name, shard in index_map.items() + if name.startswith(prefix) + } + if not param_to_shard: + raise KeyError(f"expert {expert_id} not found under {layer_prefix}") + + # group by shard + shard_to_params: Dict[str, List[str]] = {} + for p, s in param_to_shard.items(): + shard_to_params.setdefault(s, []).append(p) + + result: Dict[str, torch.Tensor] = {} + for shard, params in shard_to_params.items(): + shard_path = os.path.join(model_dir, shard) + if not os.path.exists(shard_path): + raise FileNotFoundError(shard_path) + with safe_open(shard_path, framework="pt", device="cpu") as reader: + for p in params: + result[p] = reader.get_tensor(p) + return result + +class EplbLoaderProcess(mp.Process): + """ + Independent process for blocking SSD reads. + """ + + def __init__( + self, + model_dir: str, + req_q: mp.Queue, + res_q: mp.Queue, + quit_evt: mp.Event, + index_file: str = "quant_model_weight_w8a8_dynamic.safetensors.index.json" + ): + super().__init__(daemon=True) + self.model_dir = model_dir + self.req_q = req_q + self.res_q = res_q + self.quit_evt = quit_evt + self.index_file = index_file + self.loader = SSDExpertWeightLoader() + + # ---------- process loop -------------------------------------------------# + def run(self) -> None: + while not self.quit_evt.is_set(): + try: + job = self.req_q.get(timeout=0.1) + except queue.Empty: + continue + + if job["type"] == "shutdown": + break + + try: + job_id: int = job["job_id"] + layer_prefix, expert_id = job["layer_prefix"], job["expert_id"] + + tensors = self.loader.load_expert_weight_from_ssd( + self.model_dir, + layer_prefix=layer_prefix, + expert_id=expert_id, + index_file=self.index_file + ) + self.res_q.put((job_id, tensors, None)) + except Exception as exc: + self.res_q.put((job["job_id"], None, str(exc))) + + @staticmethod + def _rpc( + layer_prefix: str, + expert_id: int, + model_dir: str, + index_file: str, + req_q: mp.Queue, + res_q: mp.Queue, + timeout: float = 30.0 + ) -> Dict[str, torch.Tensor]: + job_id = time.time_ns() + req_q.put(dict( + type="load", + job_id=job_id, + layer_prefix=layer_prefix, + expert_id=expert_id + )) + t0 = time.time() + while True: + try: + jid, tensors, err = res_q.get(timeout=0.1) + except queue.Empty: + if time.time() - t0 > timeout: + raise RuntimeError("EPLB I/O RPC timeout") + continue + if jid != job_id: + res_q.put((jid, tensors, err)) + continue + if err: + raise RuntimeError(err) + return tensors + + +@dataclass +class BatchMeta: + layer_ids: List[int] + tensors: List[torch.Tensor] # CPU tensors in this batch + + +class EplbRebalanceLoader: + + def __init__( + self, + model_dir: str, + num_layers: int, + first_dense_layers: int, + experts_per_layer: int, + layer_group_size: int = 8, + index_file: str = "quant_model_weight_w8a8_dynamic.safetensors.index.json" + enable_subprocess: bool = True + ): + self.model_dir = model_dir + self.first_dense = first_dense_layers + self.num_moe_layers = num_layers - first_dense_layers + self.experts_per_layer = experts_per_layer + self.layer_group_size = layer_group_size + self.index_file = index_file + + self.cpu_buf: List[torch.Tensor] = [] + self.npu_buf: List[torch.Tensor] = [] + + # multi-process I/O + self.enable_mp = enable_subprocess + if self.enable_mp: + self.req_q: mp.Queue = mp.Queue() + self.res_q: mp.Queue = mp.Queue() + self.quit_evt = mp.Event() + self.proc = EplbLoaderProcess( + model_dir, + self.req_q, + self.res_q, + self.quit_evt + ) + self.proc.start() + + def _load_one_expert_cpu(self, layer_prefix: 
str, expert_id: int) -> Dict[str, torch.Tensor]: + if self.enable_mp: + return EplbLoaderProcess._rpc( + layer_prefix=layer_prefix, + expert_id=expert_id, + model_dir=self.model_dir, + index_file=index_file, + req_q=self.req_q, + res_q=self.res_q + ) + + loader = SSDExpertWeightLoader() + return loader.load_expert_weight_from_ssd( + self.model_dir, + layer_prefix=layer_prefix, + expert_id=expert_id + ) + + def load_batch(self, batch_idx: int, expert_ids: List[int]) -> BatchMeta: + + start_layer = batch_idx * self.layer_group_size + self.first_dense + end_layer = min(start_layer + self.layer_group_size, + self.first_dense + self.num_moe_layers) + + tensors: List[torch.Tensor] = [] + layer_ids: List[int] = list(range(start_layer, end_layer)) + + for lid in layer_ids: + prefix = f"model.layers.{lid}.mlp" + for eid in expert_ids: + layer_pref = prefix + loaded = self._load_one_expert_cpu(layer_pref, eid) + tensors.extend(loaded.values()) + # keep CPU buffer for later copy + self.cpu_buf = tensors + return BatchMeta(layer_ids, tensors) + + def host_to_npu(self): + if not self.cpu_buf: + raise RuntimeError("CPU buffer is empty, call load_batch first") + + if not self.npu_buf: + self.npu_buf = [t.npu() for t in self.cpu_buf] + else: + for dst, src in zip(self.npu_buf, self.cpu_buf): + dst.copy_(src) + + def shutdown(self): + if self.enable_mp: + self.quit_evt.set() + self.req_q.put(dict(type="shutdown")) + self.proc.join(timeout=2.0) + log_debug("EplbLoaderProcess terminated") + + +if __name__ == "__main__": + + index_file = 'quant_model_weight_w8a8_dynamic.safetensors.index.json' + + loader = EplbRebalanceLoader( + model_dir="/home/data/eplb_vllm/DeepSeek-V3-W8A8", + num_layers=24, + first_dense_layers=3, + experts_per_layer=4, + layer_group_size=4, + enable_subprocess=True, + index_file = index_file + ) + + # load batch 0 (layers 2-5) for experts 0 and 2 + meta = loader.load_batch(batch_idx=0, expert_ids=[0, 2]) + log_debug(f"CPU tensors in batch: {len(meta.tensors)}") + + # copy to NPU + loader.host_to_npu() + log_debug(f"NPU buffer tensors: {len(loader.npu_buf)}") + + loader.shutdown() From 28af3938cc93e595f94e5a5281be06f752dd30de Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 10 Jun 2025 22:02:51 +0800 Subject: [PATCH 044/153] EPLB moed load collect --- vllm_ascend/eplb/core/loader/ssd_loader.py | 295 --------------------- vllm_ascend/models/deepseek_v2.py | 33 +++ vllm_ascend/ops/fused_moe.py | 30 ++- vllm_ascend/quantization/w8a8_dynamic.py | 7 +- vllm_ascend/worker/model_runner_v1.py | 29 ++ 5 files changed, 95 insertions(+), 299 deletions(-) delete mode 100644 vllm_ascend/eplb/core/loader/ssd_loader.py diff --git a/vllm_ascend/eplb/core/loader/ssd_loader.py b/vllm_ascend/eplb/core/loader/ssd_loader.py deleted file mode 100644 index 2c00c13a85..0000000000 --- a/vllm_ascend/eplb/core/loader/ssd_loader.py +++ /dev/null @@ -1,295 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# This file is a part of the vllm-ascend project. -# - -import os -import json -import re -import time -import queue -import math -import multiprocessing as mp -from dataclasses import dataclass -from typing import Dict, List, Tuple, Optional, Generator, Any - -import torch -import torch_npu -from safetensors import safe_open -from vllm_ascend.eplb.core.loader.abstract_loader import ExpertWeightLoader - - -def log_debug(msg: str): - print(f"[DEBUG] {msg}") - - -class SSDExpertWeightLoader: - - """ - Load all tensors that belong to one (layer, expert_id) pair, CPU only. - """ - - def __init__(self, load_config: Optional[dict] = None): - self.load_config = load_config or {} - self.counter_before_loading_weights = 0.0 - - - def _prepare_index( - self, - model_dir: str, - index_file: str - ) -> Dict[str, str]: - index_path = os.path.join(model_dir, index_file) - if not os.path.exists(index_path): - raise FileNotFoundError(index_path) - with open(index_path, "r", encoding="utf-8") as f: - data = json.load(f) - if "weight_map" not in data: - raise KeyError("weight_map missing in index file") - return data["weight_map"] - - - def load_expert_weight_from_ssd( - self, - model_dir: str, - layer_prefix: str, - expert_id: int, - index_file: str = "model.safetensors.index.json" - ) -> Dict[str, torch.Tensor]: - index_map = self._prepare_index(model_dir, index_file) - - # collect parameter names under the target expert - prefix = f"{layer_prefix}.experts.{expert_id}." - param_to_shard = { - name: shard for name, shard in index_map.items() - if name.startswith(prefix) - } - if not param_to_shard: - raise KeyError(f"expert {expert_id} not found under {layer_prefix}") - - # group by shard - shard_to_params: Dict[str, List[str]] = {} - for p, s in param_to_shard.items(): - shard_to_params.setdefault(s, []).append(p) - - result: Dict[str, torch.Tensor] = {} - for shard, params in shard_to_params.items(): - shard_path = os.path.join(model_dir, shard) - if not os.path.exists(shard_path): - raise FileNotFoundError(shard_path) - with safe_open(shard_path, framework="pt", device="cpu") as reader: - for p in params: - result[p] = reader.get_tensor(p) - return result - -class EplbLoaderProcess(mp.Process): - """ - Independent process for blocking SSD reads. 
- """ - - def __init__( - self, - model_dir: str, - req_q: mp.Queue, - res_q: mp.Queue, - quit_evt: mp.Event, - index_file: str = "quant_model_weight_w8a8_dynamic.safetensors.index.json" - ): - super().__init__(daemon=True) - self.model_dir = model_dir - self.req_q = req_q - self.res_q = res_q - self.quit_evt = quit_evt - self.index_file = index_file - self.loader = SSDExpertWeightLoader() - - # ---------- process loop -------------------------------------------------# - def run(self) -> None: - while not self.quit_evt.is_set(): - try: - job = self.req_q.get(timeout=0.1) - except queue.Empty: - continue - - if job["type"] == "shutdown": - break - - try: - job_id: int = job["job_id"] - layer_prefix, expert_id = job["layer_prefix"], job["expert_id"] - - tensors = self.loader.load_expert_weight_from_ssd( - self.model_dir, - layer_prefix=layer_prefix, - expert_id=expert_id, - index_file=self.index_file - ) - self.res_q.put((job_id, tensors, None)) - except Exception as exc: - self.res_q.put((job["job_id"], None, str(exc))) - - @staticmethod - def _rpc( - layer_prefix: str, - expert_id: int, - model_dir: str, - index_file: str, - req_q: mp.Queue, - res_q: mp.Queue, - timeout: float = 30.0 - ) -> Dict[str, torch.Tensor]: - job_id = time.time_ns() - req_q.put(dict( - type="load", - job_id=job_id, - layer_prefix=layer_prefix, - expert_id=expert_id - )) - t0 = time.time() - while True: - try: - jid, tensors, err = res_q.get(timeout=0.1) - except queue.Empty: - if time.time() - t0 > timeout: - raise RuntimeError("EPLB I/O RPC timeout") - continue - if jid != job_id: - res_q.put((jid, tensors, err)) - continue - if err: - raise RuntimeError(err) - return tensors - - -@dataclass -class BatchMeta: - layer_ids: List[int] - tensors: List[torch.Tensor] # CPU tensors in this batch - - -class EplbRebalanceLoader: - - def __init__( - self, - model_dir: str, - num_layers: int, - first_dense_layers: int, - experts_per_layer: int, - layer_group_size: int = 8, - index_file: str = "quant_model_weight_w8a8_dynamic.safetensors.index.json" - enable_subprocess: bool = True - ): - self.model_dir = model_dir - self.first_dense = first_dense_layers - self.num_moe_layers = num_layers - first_dense_layers - self.experts_per_layer = experts_per_layer - self.layer_group_size = layer_group_size - self.index_file = index_file - - self.cpu_buf: List[torch.Tensor] = [] - self.npu_buf: List[torch.Tensor] = [] - - # multi-process I/O - self.enable_mp = enable_subprocess - if self.enable_mp: - self.req_q: mp.Queue = mp.Queue() - self.res_q: mp.Queue = mp.Queue() - self.quit_evt = mp.Event() - self.proc = EplbLoaderProcess( - model_dir, - self.req_q, - self.res_q, - self.quit_evt - ) - self.proc.start() - - def _load_one_expert_cpu(self, layer_prefix: str, expert_id: int) -> Dict[str, torch.Tensor]: - if self.enable_mp: - return EplbLoaderProcess._rpc( - layer_prefix=layer_prefix, - expert_id=expert_id, - model_dir=self.model_dir, - index_file=index_file, - req_q=self.req_q, - res_q=self.res_q - ) - - loader = SSDExpertWeightLoader() - return loader.load_expert_weight_from_ssd( - self.model_dir, - layer_prefix=layer_prefix, - expert_id=expert_id - ) - - def load_batch(self, batch_idx: int, expert_ids: List[int]) -> BatchMeta: - - start_layer = batch_idx * self.layer_group_size + self.first_dense - end_layer = min(start_layer + self.layer_group_size, - self.first_dense + self.num_moe_layers) - - tensors: List[torch.Tensor] = [] - layer_ids: List[int] = list(range(start_layer, end_layer)) - - for lid in layer_ids: - prefix = 
f"model.layers.{lid}.mlp" - for eid in expert_ids: - layer_pref = prefix - loaded = self._load_one_expert_cpu(layer_pref, eid) - tensors.extend(loaded.values()) - # keep CPU buffer for later copy - self.cpu_buf = tensors - return BatchMeta(layer_ids, tensors) - - def host_to_npu(self): - if not self.cpu_buf: - raise RuntimeError("CPU buffer is empty, call load_batch first") - - if not self.npu_buf: - self.npu_buf = [t.npu() for t in self.cpu_buf] - else: - for dst, src in zip(self.npu_buf, self.cpu_buf): - dst.copy_(src) - - def shutdown(self): - if self.enable_mp: - self.quit_evt.set() - self.req_q.put(dict(type="shutdown")) - self.proc.join(timeout=2.0) - log_debug("EplbLoaderProcess terminated") - - -if __name__ == "__main__": - - index_file = 'quant_model_weight_w8a8_dynamic.safetensors.index.json' - - loader = EplbRebalanceLoader( - model_dir="/home/data/eplb_vllm/DeepSeek-V3-W8A8", - num_layers=24, - first_dense_layers=3, - experts_per_layer=4, - layer_group_size=4, - enable_subprocess=True, - index_file = index_file - ) - - # load batch 0 (layers 2-5) for experts 0 and 2 - meta = loader.load_batch(batch_idx=0, expert_ids=[0, 2]) - log_debug(f"CPU tensors in batch: {len(meta.tensors)}") - - # copy to NPU - loader.host_to_npu() - log_debug(f"NPU buffer tensors: {len(loader.npu_buf)}") - - loader.shutdown() diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 0ae1142a3d..3bd92d5bc9 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -796,6 +796,39 @@ def forward( inputs_embeds) return hidden_states + def update_expert_map(self,new_expert_map,layer_id): + self.model.layers[layer_id].mlp.experts.update_map(new_expert_map) + + def update_all_expert_map(self, new_expert_map,num_moe_layers): + num_moe_layers = len(new_expert_map) + for layer_id in range(num_moe_layers): + layer_map = new_expert_map[layer_id].to("npu") + self.model.layers[3+layer_id].mlp.experts.update_map(layer_map) + + def get_expert_map(self,layer_id): + return self.model.layers[layer_id].mlp.experts.get_map() + + def get_all_expert_map(self,num_moe_layers): + all_loads = [] + for layer_id in range(num_moe_layers): + load_tensor = self.get_expert_map(3+layer_id) # (num_experts_per_layer,) + all_loads.append(load_tensor) + + return torch.stack(all_loads, dim=0) + + def get_moe_load(self,layer_id): + return self.model.layers[layer_id].mlp.experts.get_moe_load() + + def get_all_moe_loads(self, num_moe_layers) -> torch.Tensor: + """ + output: [num_moe_layers, num_experts_per_layer] + """ + all_loads = [] + for layer_id in range(num_moe_layers): + load_tensor = self.get_moe_load(3+layer_id) # (num_experts_per_layer,) + all_loads.append(load_tensor) + + return torch.stack(all_loads, dim=0) class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM): pass diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index d6115d35c1..31b9bca2ab 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1008,6 +1008,8 @@ def __init__( AscendFusedMoE.moe_counter += 1 self.moe_instance_id = AscendFusedMoE.moe_counter + self.moe_load = None + self.topk_ids = None if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -1146,7 +1148,7 @@ def forward(self, hidden_states, router_logits) # Matrix multiply. 
- e_hidden_states = self.quant_method.apply( + e_hidden_states, self.topk_ids = self.quant_method.apply( layer=self, x=hidden_states, router_logits=router_logits, @@ -1173,6 +1175,10 @@ def forward(self, return e_hidden_states, shared_experts(hidden_states) else: return e_hidden_states + self.calculate_moe_load() + + # if self.enable_multistream_shared_expert and not is_prefill: + # hidden_states, shared_output = hidden_states if self.dp_size > 1: if VLLM_ENABLE_MC2 and not is_prefill: @@ -1226,3 +1232,25 @@ def _forward_ms_fused_moe_comp( enable_force_load_balance=enable_force_load_balance) return hidden_states + + def update_map(self,new_expert_map): + self.expert_map = new_expert_map + + def get_map(self): + return self.expert_map + + def get_moe_load(self): + return self.moe_load + + def calculate_moe_load(self): + if self.moe_load is None: + self.moe_load = torch.zeros(self.num_experts, + dtype=torch.int64, + device=self.topk_ids.device) + + ids = self.topk_ids.flatten().to(torch.int64) + + ones = torch.ones_like(ids, dtype=torch.int64, device=ids.device) + self.moe_load.scatter_add_(0, ids, ones) + + return self.moe_load \ No newline at end of file diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index 66a0a302c8..351f9b9c98 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -674,7 +674,8 @@ def apply( moe_all_to_all_group_name=self.moe_all_to_all_group_name, log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, - shared_experts=shared_experts) + shared_experts=shared_experts, + **kwargs), topk_ids elif self.torchair_graph_enabled or self.ep_group.world_size == 1: return fused_experts(hidden_states=x, w1=layer.w13_weight, @@ -684,7 +685,7 @@ def apply( topk_weights=topk_weights, topk_ids=topk_ids, top_k=top_k, - expert_map=expert_map) + expert_map=expert_map), topk_ids else: # The current implementation of deepseek moe splits hidden_states # according to tp_size before they are feed into fused_moe module. 
@@ -703,7 +704,7 @@ def apply( ep_group=self.ep_group, log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, - ) + ), topk_ids def process_weights_after_loading(self, layer): if self.transpose_weight: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c3587936f9..671bd3e023 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -25,6 +25,7 @@ from contextlib import contextmanager, nullcontext from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Union +from multiprocessing import Manager import numpy as np import numpy.typing as npt @@ -77,6 +78,10 @@ from vllm_ascend.utils import ProfileExecuteDuration from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer +from vllm_ascend.eplb.eplb_updator import EplbUpdator +from vllm_ascend.eplb.adaptor.vllm_adaptor import VllmEplbAdaptor +from vllm_ascend.eplb.core.loader.device_transfer_loader import D2DExpertWeightLoader + if TYPE_CHECKING: import xgrammar as xgr # type: ignore[import-untyped] from vllm.v1.core.sched.output import SchedulerOutput @@ -363,6 +368,20 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.dp_size = vllm_config.parallel_config.data_parallel_size self.dp_rank = vllm_config.parallel_config.data_parallel_rank + #EPLB + self.enable_eplb = True + # if additional_config: + # self.enable_torchair_graph_mode = additional_config.get( + # "enable_graph_mode", + # False) and self.vllm_config.model_config.use_mla + # self.use_cached_npu_graph = additional_config.get( + # "use_cached_npu_graph", False) + # self.enable_eplb = additional_config.get("enable_eplb", False) + + if self.enable_eplb == True: + self.eplb_adaptor = None + self.eplb_updator = EplbUpdator() + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler output. 
@@ -1205,6 +1224,13 @@ def execute_model( scheduler_output: "SchedulerOutput", intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[ModelRunnerOutput, torch.Tensor]: + + if self.enable_eplb: + if self.eplb_adaptor == None: + self.eplb_adaptor = VllmEplbAdaptor(model=self.model) + self.eplb_updator.set_adaptor(self.eplb_adaptor) + self.eplb_updator.forward_before() + with ProfileExecuteDuration().capture_async( "prepare input and forward"): self._update_states(scheduler_output) @@ -1320,6 +1346,9 @@ def execute_model( logger.info("Profile execute duration [%s]:%s", captured_name, " ".join(dr_str)) + if self.enable_eplb: + self.eplb_updator.forward_end() + return model_runner_output def _profile_multimodal(self) -> None: From aa619f4df40dd8a34fe2176d3234cb29dd7ff9ac Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 10 Jun 2025 22:03:52 +0800 Subject: [PATCH 045/153] delete invalida import --- vllm_ascend/worker/model_runner_v1.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 671bd3e023..4f2908ac6b 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -25,7 +25,6 @@ from contextlib import contextmanager, nullcontext from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Union -from multiprocessing import Manager import numpy as np import numpy.typing as npt From 6fc343cd4dd3b3dc93d2196bfef4829d37caf50d Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Wed, 11 Jun 2025 17:23:48 +0800 Subject: [PATCH 046/153] fix bugs in fused_experts_with_all2all --- vllm_ascend/eplb/core/worker/eplb_worker.py | 2 +- vllm_ascend/quantization/w8a8_dynamic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 581a509c96..30e3919720 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -101,7 +101,7 @@ def compose_expert_update_info(self, updated_expert_maps, current_expert_maps): if dst_rank_id not in expert_recv_info_this_layer: expert_recv_info_this_layer[dst_rank_id] = [] - if not torch.isin(src_rank_indices, torch.tensor(expert_id)).any(): + if not torch.isin(torch.tensor(expert_id), experts_to_send).any(): # if expert_id are not sent out from any npu, it will be copied from one npu holding this expert candidate_src_rank_indices = torch.where(current_expert_maps_this_layer[:, expert_id] != -1) else: diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index 351f9b9c98..cdbf95f02d 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -233,7 +233,7 @@ def fused_experts_with_all2all( log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, ): - if log2phy: + if log2phy is not None: topk_ids = log2phy[topk_ids] original_shape = hidden_states.shape if len(original_shape) == 3: From b97e0664a2cf9a55735ecfbe70616373c6191bce Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Wed, 11 Jun 2025 20:19:40 +0800 Subject: [PATCH 047/153] Adapt static EPLB --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 51 ++++++++++++++++++- .../core/loader/device_transfer_loader.py | 11 +++- vllm_ascend/eplb/eplb_updator.py | 2 + vllm_ascend/models/deepseek_v2.py | 14 ++--- vllm_ascend/ops/fused_moe.py | 12 ++++- 5 files changed, 77 insertions(+), 13 deletions(-) diff --git 
a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 531ff205eb..b4afdfdfee 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -43,6 +43,11 @@ def __init__(self, model, **args): self.expert_map_per_layer[self.num_dense_layers + layer_idx] =\ self.model.get_expert_map(self.num_dense_layers + layer_idx) + self.log2phy_map_per_layer = dict() + for layer_idx in range(self.num_moe_layers): + self.log2phy_map_per_layer[self.num_dense_layers + layer_idx] =\ + self.model.get_log2phy_map(self.num_dense_layers + layer_idx) + def init_buffer_tensor_dict(self, num_buffer_tensor): for name in self.expert_weight_names: complete_name = "model.layers." + str(self.num_dense_layers) + ".mlp.experts." + name @@ -88,4 +93,48 @@ def do_update_expert_weight(self, layer_id, expert_id_before_replace, buffer_ten complete_name = "model.layers." + str(layer_id) + ".mlp.experts." + name local_expert_id = self.expert_map_per_layer[layer_id][expert_id_before_replace].item() expert_tensor = self.param_dict[complete_name].data[local_expert_id] - expert_tensor.copy_(self.buffer_tensor_dict[name][buffer_tensor_id]) \ No newline at end of file + expert_tensor.copy_(self.buffer_tensor_dict[name][buffer_tensor_id]) + + def generate_index_dicts(self,tensor_2d): + dict_list = [] + current_idx = 0 + + for row in tensor_2d: + value_to_index = {} + for i in range(row.size(0)): + value = row[i].item() + value_to_index[value] = current_idx + i + dict_list.append(value_to_index) + current_idx += row.size(0) + + return dict_list + + def generate_log2phy_map(self, expert_map): + ranks_num, global_expert_num = expert_map.shape + concatenated = torch.flatten(expert_map) + rank_expert_to_global = self.generate_index_dicts( + expert_map) + result_dict: Dict[int, List[int]] = {} + for idx, value in enumerate(concatenated): + key = value.item() + if key not in result_dict: + result_dict[key] = [] + result_dict[key].append(idx) + + log2phy_map = torch.full((ranks_num, global_expert_num), + -1, + dtype=torch.int32) + for rank in range(ranks_num): + for key in result_dict: + indices_in_concat = result_dict[key] + if key in rank_expert_to_global[rank]: + log2phy_map[rank][key] = rank_expert_to_global[rank][key] + else: + chosen_index = random.choice(indices_in_concat) + log2phy_map[rank][key] = chosen_index + return log2phy_map + + def do_update_log2phy_map(self, layer_id, updated_log2phy_map): + + if self.log2phy_map_per_layer[layer_id] is not None: + self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map) \ No newline at end of file diff --git a/vllm_ascend/eplb/core/loader/device_transfer_loader.py b/vllm_ascend/eplb/core/loader/device_transfer_loader.py index ea48798670..17cc1500e3 100644 --- a/vllm_ascend/eplb/core/loader/device_transfer_loader.py +++ b/vllm_ascend/eplb/core/loader/device_transfer_loader.py @@ -33,6 +33,7 @@ def __init__(self, eplb_adaptor): self.eplb_adaptor = eplb_adaptor self.updated_expert_map = None + self.updated_log2phy_map = None self.layer_id = -1 # layer id to be updated self.state = ExpertWeightUpdateState.WAITING self.recv_expert_list = [] @@ -68,6 +69,9 @@ def generate_expert_d2d_transfer_task(self, expert_send_info, expert_recv_info, self.state = ExpertWeightUpdateState.READY + def generate_log2phy_map(self,expert_map): + self.updated_log2phy_map = self.eplb_adaptor.generate_log2phy_map(expert_map) + def asyn_expert_weight_transfer(self, reqs): # Only when send/recv tasks are parsed into self.comm_op_list, d2d 
send/recv tasks can be luanched if self.state != ExpertWeightUpdateState.READY: @@ -94,6 +98,9 @@ def update_expert_map_and_weight(self, reqs): # update expert_map self.eplb_adaptor.do_update_expert_map(self.layer_id, self.updated_expert_map) + #update log2phy_map + self.eplb_adaptor.do_update_log2phy_map(self.layer_id, self.updated_log2phy_map) + # update expert weight buffer_tensor_id = 0 for recv_expert_info in self.recv_expert_list: @@ -138,4 +145,6 @@ def generate_mock_update_info(self, rank_id): return (expert_send_info, expert_recv_info, updated_expert_map, layer_id) def load_impl(self, old_expert_table, new_expert_table): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError + + \ No newline at end of file diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 99cd42255b..5cc4d57bd7 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -89,9 +89,11 @@ def forward_before(self): if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.block_update_queue.get() rank_id = torch.distributed.get_rank() + self.eplb_loader.generate_log2phy_map(updated_expert_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] # TODO: layer_id + 3 should be replaced by configuration + logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info_this_rank, expert_recv_info_this_rank, updated_expert_map[rank_id], layer_id + 3) self.weight_update_counter += 1 diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 3bd92d5bc9..b032dac6a3 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -796,18 +796,12 @@ def forward( inputs_embeds) return hidden_states - def update_expert_map(self,new_expert_map,layer_id): - self.model.layers[layer_id].mlp.experts.update_map(new_expert_map) - - def update_all_expert_map(self, new_expert_map,num_moe_layers): - num_moe_layers = len(new_expert_map) - for layer_id in range(num_moe_layers): - layer_map = new_expert_map[layer_id].to("npu") - self.model.layers[3+layer_id].mlp.experts.update_map(layer_map) - def get_expert_map(self,layer_id): return self.model.layers[layer_id].mlp.experts.get_map() + def get_log2phy_map(self,layer_id): + return self.model.layers[layer_id].mlp.experts.get_log2phy_map() + def get_all_expert_map(self,num_moe_layers): all_loads = [] for layer_id in range(num_moe_layers): @@ -825,7 +819,7 @@ def get_all_moe_loads(self, num_moe_layers) -> torch.Tensor: """ all_loads = [] for layer_id in range(num_moe_layers): - load_tensor = self.get_moe_load(3+layer_id) # (num_experts_per_layer,) + load_tensor = self.get_moe_load(3+layer_id) all_loads.append(load_tensor) return torch.stack(all_loads, dim=0) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 31b9bca2ab..6d10983f3b 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1047,6 +1047,11 @@ def __init__( self.log2phy = None self.global_redundant_expert_num = 0 + # test log2phy + self.log2phy = torch.full((self.ep_size, self.global_num_experts), + -1, + dtype=torch.int32) + ascend_config = get_ascend_config() expert_map_path = 
ascend_config.expert_map_path if expert_map_path and os.path.exists(expert_map_path): @@ -1175,6 +1180,8 @@ def forward(self, return e_hidden_states, shared_experts(hidden_states) else: return e_hidden_states + print("log2phy:", self.log2phy) + # print("expert_map:", self.expert_map) self.calculate_moe_load() # if self.enable_multistream_shared_expert and not is_prefill: @@ -1239,6 +1246,9 @@ def update_map(self,new_expert_map): def get_map(self): return self.expert_map + def get_log2phy_map(self): + return self.log2phy + def get_moe_load(self): return self.moe_load @@ -1253,4 +1263,4 @@ def calculate_moe_load(self): ones = torch.ones_like(ids, dtype=torch.int64, device=ids.device) self.moe_load.scatter_add_(0, ids, ones) - return self.moe_load \ No newline at end of file + return self.moe_load From 474a5c300e7a85f2e81ea0f27312f6f719bc40ad Mon Sep 17 00:00:00 2001 From: NPUDev Date: Wed, 11 Jun 2025 19:38:52 +0800 Subject: [PATCH 048/153] add eplb tabel generator --- vllm_ascend/eplb/tool/generate_map.py | 65 +++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 vllm_ascend/eplb/tool/generate_map.py diff --git a/vllm_ascend/eplb/tool/generate_map.py b/vllm_ascend/eplb/tool/generate_map.py new file mode 100644 index 0000000000..b498e73a06 --- /dev/null +++ b/vllm_ascend/eplb/tool/generate_map.py @@ -0,0 +1,65 @@ +import numpy as np +import json +import argparse + + +def split_and_insert(n, k, m): + ''' + n: expert num + k: card num + m: redundant expert num, make sure m%k==0 + ''' + + A = np.arange(n) + + B = np.random.choice(n, size=m, replace=False) + + groups = np.array_split(A, k) + + for j in range(m // k): + for i in range(k): + groups[i] = np.append(groups[i], B[i + j * k]) + return np.concatenate(groups) + + +def random_generation(n_layer=58, n_expert=256, start_layer_idx=0, device_count=128, n_redundant=128, output_name=""): + expert_data = {} + expert_data["moe_layer_count"] = n_layer + layer_list = [] + for i in range(n_layer): + layer = {"layer_id": start_layer_idx + i, "device_count": device_count} + random_placement = split_and_insert(n_expert, device_count, n_redundant) + device_list = [] + step = random_placement.shape[0] // device_count + for j in range(device_count): + device = {} + device["device_id"] = j + device["device_expert"] = random_placement[j * step: (j + 1) * step].tolist() + device_list.append(device) + layer["device_list"] = device_list + layer_list.append(layer) + + expert_data["layer_list"] = layer_list + json_file_path = output_name + + with open(json_file_path, "w") as f: + json.dump(expert_data, f, indent=4) + + print(f"JSON file generated: {json_file_path}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="python generate_map.py --n_layers 2 --n_experts 256 --card_num 8 --n_redundant 8 --output expert_map.json") + parser.add_argument("--n_layers", type=int, required=True) + parser.add_argument("--n_experts", type=int, required=True) + parser.add_argument("--card_num", type=int, required=True) + parser.add_argument("--n_redundant", type=int, default=0) + parser.add_argument("--output", type=str, default="expert_map.json") + args = parser.parse_args() + + n_layers = args.n_layers + n_experts = args.n_experts + card_num = args.card_num + n_redundant = args.n_redundant + output = args.output + + random_generation(n_layers, n_experts, 0, card_num, n_redundant, output) From 807348fe7e0fa18ca02653dc549a72f4f204b130 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Wed, 11 Jun 2025 20:34:59 +0800 Subject: [PATCH 
049/153] add enable_eplb in ascend_config --- vllm_ascend/ascend_config.py | 1 + vllm_ascend/worker/model_runner_v1.py | 10 +--------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 2d3428327f..56cd74d384 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -39,6 +39,7 @@ def __init__(self, vllm_config): self.expert_tensor_parallel_size = int( additional_config.get("expert_tensor_parallel_size", 0)) self.expert_map_path = additional_config.get("expert_map_path", None) + self.enable_eplb = additional_config.get("enable_eplb", False) class TorchairGraphConfig: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 4f2908ac6b..607908faac 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -368,15 +368,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.dp_rank = vllm_config.parallel_config.data_parallel_rank #EPLB - self.enable_eplb = True - # if additional_config: - # self.enable_torchair_graph_mode = additional_config.get( - # "enable_graph_mode", - # False) and self.vllm_config.model_config.use_mla - # self.use_cached_npu_graph = additional_config.get( - # "use_cached_npu_graph", False) - # self.enable_eplb = additional_config.get("enable_eplb", False) - + self.enable_eplb = ascend_config.enable_eplb if self.enable_eplb == True: self.eplb_adaptor = None self.eplb_updator = EplbUpdator() From 85d29c5bafafafea5fa228fa96529ae58666fff3 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Wed, 11 Jun 2025 20:55:57 +0800 Subject: [PATCH 050/153] enable_eplb -> dynamic_eplb --- vllm_ascend/ascend_config.py | 2 +- vllm_ascend/worker/model_runner_v1.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 56cd74d384..63c8f1398b 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -39,7 +39,7 @@ def __init__(self, vllm_config): self.expert_tensor_parallel_size = int( additional_config.get("expert_tensor_parallel_size", 0)) self.expert_map_path = additional_config.get("expert_map_path", None) - self.enable_eplb = additional_config.get("enable_eplb", False) + self.dynamic_eplb = additional_config.get("dynamic_eplb", False) class TorchairGraphConfig: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 607908faac..9c010be90b 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -368,8 +368,8 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.dp_rank = vllm_config.parallel_config.data_parallel_rank #EPLB - self.enable_eplb = ascend_config.enable_eplb - if self.enable_eplb == True: + self.dynamic_eplb = ascend_config.dynamic_eplb + if self.dynamic_eplb == True: self.eplb_adaptor = None self.eplb_updator = EplbUpdator() @@ -1216,7 +1216,7 @@ def execute_model( intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[ModelRunnerOutput, torch.Tensor]: - if self.enable_eplb: + if self.dynamic_eplb: if self.eplb_adaptor == None: self.eplb_adaptor = VllmEplbAdaptor(model=self.model) self.eplb_updator.set_adaptor(self.eplb_adaptor) @@ -1337,7 +1337,7 @@ def execute_model( logger.info("Profile execute duration [%s]:%s", captured_name, " ".join(dr_str)) - if self.enable_eplb: + if self.dynamic_eplb: self.eplb_updator.forward_end() return model_runner_output From 
e4172aac84cb021f1da250c4c3242614410b0085 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Sat, 14 Jun 2025 11:17:33 +0800 Subject: [PATCH 051/153] fix bugs in dynamioc eplb --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 13 +++++---- .../eplb/core/policy/policy_factory.py | 4 +-- vllm_ascend/eplb/core/worker/eplb_worker.py | 29 +++++++++++++++++-- vllm_ascend/eplb/eplb_updator.py | 4 +-- vllm_ascend/ops/fused_moe.py | 8 ++--- 5 files changed, 41 insertions(+), 17 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index b4afdfdfee..0785ac5fe3 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -34,15 +34,16 @@ def __init__(self, model, **args): self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", "w2_weight_scale", "w2_weight_offset"] - self.buffer_tensor_dict = dict() - num_buffer_tensor = 100 # TO DO: provide number of buffer tensor by vllm configuration - self.init_buffer_tensor_dict(num_buffer_tensor) - self.expert_map_per_layer = dict() for layer_idx in range(self.num_moe_layers): self.expert_map_per_layer[self.num_dense_layers + layer_idx] =\ self.model.get_expert_map(self.num_dense_layers + layer_idx) + self.buffer_tensor_dict = dict() + # TODO: here we set number of buffer tensor equal to number of expert in each laryer, which can be improved + num_buffer_tensor = torch.where(self.expert_map_per_layer[self.num_dense_layers] != -1)[0].numel() + self.init_buffer_tensor_dict(num_buffer_tensor) + self.log2phy_map_per_layer = dict() for layer_idx in range(self.num_moe_layers): self.log2phy_map_per_layer[self.num_dense_layers + layer_idx] =\ @@ -123,7 +124,7 @@ def generate_log2phy_map(self, expert_map): log2phy_map = torch.full((ranks_num, global_expert_num), -1, - dtype=torch.int32) + dtype=torch.int32) for rank in range(ranks_num): for key in result_dict: indices_in_concat = result_dict[key] @@ -135,6 +136,6 @@ def generate_log2phy_map(self, expert_map): return log2phy_map def do_update_log2phy_map(self, layer_id, updated_log2phy_map): - + if self.log2phy_map_per_layer[layer_id] is not None: self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map) \ No newline at end of file diff --git a/vllm_ascend/eplb/core/policy/policy_factory.py b/vllm_ascend/eplb/core/policy/policy_factory.py index d06d1ce634..726d66ff86 100644 --- a/vllm_ascend/eplb/core/policy/policy_factory.py +++ b/vllm_ascend/eplb/core/policy/policy_factory.py @@ -1,7 +1,7 @@ # Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
from .eplb_policy import EplbPolicy, DynamicConfig from .mock_load_balance import MockLoadBalance -from .mock_dynamic_ep import DynamicEP +#from .mock_dynamic_ep import DynamicEP @@ -10,6 +10,6 @@ class PolicyFactory: def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy: policy = { 0:MockLoadBalance , # MockLoadBalance - 1:DynamicEP, # DynamicEP + #1:DynamicEP, # When real eplb algorithm is ready, recover this } return policy.get(policy_type, MockLoadBalance)(config) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 30e3919720..9d375832cb 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -60,6 +60,9 @@ def do_update(self): load_info, old_placemet = self.global2local(load_info, self.old_expert_maps, self.num_local_experts) changed, priority, new_placement = self.calculate_rebalance_experts(load_info, old_placemet) + if not torch.is_tensor(new_placement): + new_placement = torch.tensor(new_placement) + self.check_expert_placement(old_placemet, new_placement) new_expert_maps = self.local2global(new_placement) logger.debug(f"[EPLB Process new_map differs, performing D2D") @@ -70,6 +73,28 @@ def do_update(self): return update_info + def check_expert_placement(self, old_placemet, new_placement): + num_layers = old_placemet.shape[0] + num_ranks = old_placemet.shape[1] + + for layer_id in range(num_layers): + for rank_id in range(num_ranks): + new_placement_check = new_placement[layer_id][rank_id] + old_placement_check = old_placement[layer_id][rank_id] + + # check if same logical experts are placed on the same NPU + if new_placement_check.numel() != torch.unique(new_placement_check).numel(): + logger.error(f"Replicated experts are placed on the same NPU, expert placement on layer {layer_id}, rank {rank_id} is invalid") + new_placement[layer_id] = old_placement[layer_id] + break + + # check if there is any experts movement inside one NPU + expert_not_move = torch.isin(new_placement_check, old_placement_check) + if not torch.equal(new_placement[expert_not_move], old_placemet[expert_not_move]): + logger.error(f"There exists expert movement inside NPU, expert placement on layer {layer_id}, rank {rank_id} is invalid") + new_placement[layer_id] = old_placement[layer_id] + break + # TODO: Here only expert weight exchange is considered, need to be extended to cover other weight update cases def compose_expert_update_info(self, updated_expert_maps, current_expert_maps): num_layers = current_expert_maps.shape[0] @@ -231,7 +256,7 @@ def worker_process(self,planner_q,block_update_q): update_info = self.worker.do_update() - print("update_info:", update_info) + logger.info("update_info:", update_info) for (a,b,c,d) in update_info: while True: @@ -240,8 +265,6 @@ def worker_process(self,planner_q,block_update_q): block_update_q.put((a,b,c,d)) break - print("EPLB Process complete") - except Exception as e: logger.warning(f"[EPLB subprocess Exiting due to error: {e}", exc_info=True) break diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 5cc4d57bd7..3ae9060d9c 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -30,12 +30,12 @@ def __init__(self): def set_adaptor(self, adaptor): self.adaptor = adaptor self.eplb_loader = D2DExpertWeightLoader(eplb_adaptor=self.adaptor) + self.num_moe_layers = self.adaptor.num_moe_layers def init_eplb(self): self.num_iterations: torch.int64 = 10 - self.num_moe_layers = 2 
self.weight_update_counter = 0 self.expert_map_initialized = False self.update_in_flight = False @@ -92,12 +92,12 @@ def forward_before(self): self.eplb_loader.generate_log2phy_map(updated_expert_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] - # TODO: layer_id + 3 should be replaced by configuration logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info_this_rank, expert_recv_info_this_rank, updated_expert_map[rank_id], layer_id + 3) self.weight_update_counter += 1 if self.weight_update_counter == self.num_moe_layers: + self.weight_update_counter = 0 self.update_in_flight = False # set asynchronous stream for d2d expert weight update diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 6d10983f3b..c438f73b37 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1047,10 +1047,10 @@ def __init__( self.log2phy = None self.global_redundant_expert_num = 0 - # test log2phy - self.log2phy = torch.full((self.ep_size, self.global_num_experts), - -1, - dtype=torch.int32) + # TODO: if this is not need for dynamic eplb with redundant expert, remove this + # self.log2phy = torch.full((self.ep_size, self.global_num_experts), + # -1, + # dtype=torch.int32) ascend_config = get_ascend_config() expert_map_path = ascend_config.expert_map_path From 264a3a585e120523fb916353738c53d4ed2dad7e Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Sat, 14 Jun 2025 15:27:58 +0800 Subject: [PATCH 052/153] delete print in funsed_moe forward --- vllm_ascend/ops/fused_moe.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index c438f73b37..1a02b7a144 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1180,8 +1180,7 @@ def forward(self, return e_hidden_states, shared_experts(hidden_states) else: return e_hidden_states - print("log2phy:", self.log2phy) - # print("expert_map:", self.expert_map) + self.calculate_moe_load() # if self.enable_multistream_shared_expert and not is_prefill: From a3b5af830709960465496e8fed3473b997d28668 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Sat, 14 Jun 2025 16:59:00 +0800 Subject: [PATCH 053/153] [CI/UT][Graph] Add ut for torchair graph mode (#1103) ### What this PR does / why we need it? Add ut for torchair graph mode on DeepSeekV3 ### How was this patch tested? CI passed with new added test. 
--------- Signed-off-by: MengqingCao Signed-off-by: Mengqing Cao --- docs/source/user_guide/additional_config.md | 18 ++--- docs/source/user_guide/graph_mode.md | 5 +- tests/conftest.py | 9 ++- tests/multicard/test_torchair_graph_mode.py | 80 +++++++++++++++++++++ 4 files changed, 100 insertions(+), 12 deletions(-) create mode 100644 tests/multicard/test_torchair_graph_mode.py diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index 4608326ecd..778938a579 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -53,7 +53,7 @@ The details of each config option are as follows: | ---- | ---- | ------- | ----------- | | `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine| -ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `chunked_prefill_enabled: true` to ascend_scheduler_config as well. +ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well. ### Example @@ -62,18 +62,18 @@ A full example of additional configuration is as follows: ``` { "torchair_graph_config": { - "enabled": true, - "use_cached_graph": true, + "enabled": True, + "use_cached_graph": True, "graph_batch_sizes": [1, 2, 4, 8], - "graph_batch_sizes_init": false, - "enable_multistream_moe": false, - "enable_kv_nz": false + "graph_batch_sizes_init": False, + "enable_multistream_moe": False, + "enable_kv_nz": False }, "ascend_scheduler_config": { - "enabled": true, - "chunked_prefill_enabled": true, + "enabled": True, + "enable_chunked_prefill": True, }, "expert_tensor_parallel_size": 1, - "refresh": false, + "refresh": False, } ``` diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md index 6831809f22..161b89a0de 100644 --- a/docs/source/user_guide/graph_mode.md +++ b/docs/source/user_guide/graph_mode.md @@ -47,14 +47,15 @@ from vllm import LLM os.environ["VLLM_USE_V1"] = 1 -model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True}}) +# TorchAirGraph is only work without chunked-prefill now +model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}}) outputs = model.generate("Hello, how are you?") ``` online example: ```shell -vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enabled": true}}' +vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}}' ``` You can find more detail about additional config [here](./additional_config.md) diff --git a/tests/conftest.py b/tests/conftest.py index 16bbc8027d..e0d70a19d2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,7 @@ # Adapted from vllm-project/vllm/blob/main/tests/conftest.py # +import contextlib import gc from typing import List, Optional, Tuple, TypeVar, Union @@ -53,11 +54,17 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray] -def cleanup_dist_env_and_memory(): +def cleanup_dist_env_and_memory(shutdown_ray: bool = False): destroy_model_parallel() destroy_distributed_environment() + with 
contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + if shutdown_ray: + import ray # Lazy import Ray + ray.shutdown() gc.collect() torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() class VllmRunner: diff --git a/tests/multicard/test_torchair_graph_mode.py b/tests/multicard/test_torchair_graph_mode.py new file mode 100644 index 0000000000..d06ec7de22 --- /dev/null +++ b/tests/multicard/test_torchair_graph_mode.py @@ -0,0 +1,80 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +"""Compare the short outputs of HF and vLLM when using greedy sampling. + +Run `pytest tests/multicard/test_torchair_graph_mode.py`. +""" +import os + +import pytest + +from tests.conftest import VllmRunner + +os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" + + +@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", + reason="torchair graph is not supported on v0") +def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + + example_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + dtype = "half" + max_tokens = 5 + # torchair is only work without chunked-prefill now + with VllmRunner( + "vllm-ascend/DeepSeek-V3-Pruning", + dtype=dtype, + tensor_parallel_size=4, + distributed_executor_backend="mp", + additional_config={ + "torchair_graph_config": { + "enabled": True, + }, + "ascend_scheduler_config": { + "enabled": True, + }, + "refresh": True, + }, + enforce_eager=False, + ) as vllm_model: + # use greedy sampler to make sure the generated results are fix + vllm_output = vllm_model.generate_greedy(example_prompts, + max_tokens) + # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random weight of + # DeepSeek-V3 with 2 hidden layers, thus the golden results seems + # inaccurate. This will only change if accuracy improves with the + # official weights of DeepSeek-V3. 
+ golden_results = [ + 'Hello, my name is feasibility伸 spazio debtor添', + 'The president of the United States is begg"""\n杭州风和 bestimm', + 'The capital of France is frequentlyশามalinkAllowed', + 'The future of AI is deleting俯احت怎么样了حراف', + ] + + assert len(golden_results) == len(vllm_output) + for i in range(len(vllm_output)): + assert golden_results[i] == vllm_output[i][1] + print(f"Generated text: {vllm_output[i][1]!r}") From a5dd04fc377a0309a16513444e8e134381bde660 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Sat, 14 Jun 2025 17:00:59 +0800 Subject: [PATCH 054/153] fix bugs caused by variable name old_placemet --- vllm_ascend/eplb/core/worker/eplb_worker.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 9d375832cb..91f30dbac9 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -57,12 +57,12 @@ def do_update(self): return #根据负载信息,获取更新后的专家表 - load_info, old_placemet = self.global2local(load_info, self.old_expert_maps, self.num_local_experts) - changed, priority, new_placement = self.calculate_rebalance_experts(load_info, old_placemet) + load_info, old_placement = self.global2local(load_info, self.old_expert_maps, self.num_local_experts) + changed, priority, new_placement = self.calculate_rebalance_experts(load_info, old_placement) if not torch.is_tensor(new_placement): new_placement = torch.tensor(new_placement) - self.check_expert_placement(old_placemet, new_placement) + self.check_expert_placement(old_placement, new_placement) new_expert_maps = self.local2global(new_placement) logger.debug(f"[EPLB Process new_map differs, performing D2D") @@ -73,9 +73,9 @@ def do_update(self): return update_info - def check_expert_placement(self, old_placemet, new_placement): - num_layers = old_placemet.shape[0] - num_ranks = old_placemet.shape[1] + def check_expert_placement(self, old_placement, new_placement): + num_layers = old_placement.shape[0] + num_ranks = old_placement.shape[1] for layer_id in range(num_layers): for rank_id in range(num_ranks): @@ -90,7 +90,7 @@ def check_expert_placement(self, old_placemet, new_placement): # check if there is any experts movement inside one NPU expert_not_move = torch.isin(new_placement_check, old_placement_check) - if not torch.equal(new_placement[expert_not_move], old_placemet[expert_not_move]): + if not torch.equal(new_placement[expert_not_move], old_placement[expert_not_move]): logger.error(f"There exists expert movement inside NPU, expert placement on layer {layer_id}, rank {rank_id} is invalid") new_placement[layer_id] = old_placement[layer_id] break From 241b722a57fdd200f05d254412aa9624bc3715ca Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Sat, 14 Jun 2025 17:08:35 +0800 Subject: [PATCH 055/153] move get_init_expert_map to forward_before --- vllm_ascend/eplb/core/worker/eplb_worker.py | 2 +- vllm_ascend/eplb/eplb_updator.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 91f30dbac9..4fbf007426 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -90,7 +90,7 @@ def check_expert_placement(self, old_placement, new_placement): # check if there is any experts movement inside one NPU expert_not_move = torch.isin(new_placement_check, old_placement_check) - if not 
torch.equal(new_placement[expert_not_move], old_placement[expert_not_move]): + if not torch.equal(new_placement_check[expert_not_move], old_placement_check[expert_not_move]): logger.error(f"There exists expert movement inside NPU, expert placement on layer {layer_id}, rank {rank_id} is invalid") new_placement[layer_id] = old_placement[layer_id] break diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 3ae9060d9c..def5af2257 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -86,6 +86,8 @@ def wakeup_eplb_worker(self): self.planner_block_queue.put(1) def forward_before(self): + self.get_init_expert_map() + if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.block_update_queue.get() rank_id = torch.distributed.get_rank() @@ -105,9 +107,6 @@ def forward_before(self): self.eplb_loader.asyn_expert_weight_transfer(self.reqs) def forward_end(self): - - self.get_init_expert_map() - if not self.update_in_flight and self.get_update_iteration(): moe_load = self.compute_and_set_moe_load() self.wakeup_eplb_worker() From 441508ca9489b3c22ae530b79833dc3d28cf377a Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Sat, 14 Jun 2025 22:26:44 +0800 Subject: [PATCH 056/153] fix bug in log2phy in dynamic w8a8 --- vllm_ascend/quantization/w8a8_dynamic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index cdbf95f02d..901b80f81b 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -120,7 +120,7 @@ def fused_experts_with_mc2( global_redundant_expert_num: int = 0, shared_experts: Optional[Any] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - if log2phy: + if log2phy is not None: topk_ids = log2phy[topk_ids] global_bs = 0 moe_expert_num = len(expert_map) + global_redundant_expert_num From ab5d110fcc35ca11330977450141b1d7176f21e7 Mon Sep 17 00:00:00 2001 From: fems14 <74094523+fems14@users.noreply.github.com> Date: Sat, 14 Jun 2025 22:31:16 +0800 Subject: [PATCH 057/153] vllm-ascend support chunked prefill (#1172) ### What this PR does / why we need it? vllm-ascend support chunked prefill for MLA --------- Signed-off-by: fems14 <1804143737@qq.com> --- docs/source/user_guide/additional_config.md | 1 + tests/singlecard/test_chunked.py | 74 ++++++ vllm_ascend/ascend_config.py | 2 + vllm_ascend/attention/mla_v1.py | 235 ++++++++++++++++++-- vllm_ascend/worker/model_runner_v1.py | 11 +- 5 files changed, 303 insertions(+), 20 deletions(-) create mode 100644 tests/singlecard/test_chunked.py diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index 778938a579..d4756ef5e1 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -31,6 +31,7 @@ The following table lists the additional configuration options available in vLLM | `expert_tensor_parallel_size` | str | `0` | Expert tensor parallel size the model to use. | | `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf case. | | `expert_map_path` | str | None | When using expert load balancing for the MOE model, an expert map path needs to be passed in. | +| `chunked_prefill_for_mla` | bool | `False` | Whether to enable the fused operator-like chunked_prefill. 
| The details of each config option are as follows: diff --git a/tests/singlecard/test_chunked.py b/tests/singlecard/test_chunked.py new file mode 100644 index 0000000000..2240b88e2c --- /dev/null +++ b/tests/singlecard/test_chunked.py @@ -0,0 +1,74 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Compare the outputs of vLLM with and without aclgraph. + +Run `pytest tests/compile/test_aclgraph.py`. +""" + +import os + +import pytest +import torch +from vllm import LLM, SamplingParams + +MODELS = ["deepseek-ai/DeepSeek-V2-Lite"] + + +@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", + reason="new chunked only support on v1") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [1]) +def test_models( + model: str, + max_tokens: int, + monkeypatch: pytest.MonkeyPatch, +) -> None: + return + with monkeypatch.context() as m: + prompts = "The president of the United States is" + + m.setenv("VLLM_USE_V1", "1") + + sampling_params = SamplingParams( + max_tokens=max_tokens, + temperature=0.0, + ) + + vllm_model = LLM(model, + long_prefill_token_threshold=4, + enforce_eager=True) + output_chunked = vllm_model.generate(prompts, sampling_params) + logprobs_chunked = output_chunked.outputs[0].logprobs + del vllm_model + torch.npu.empty_cache() + + vllm_model = LLM(model, + enforce_eager=True, + additional_config={ + 'ascend_scheduler_config': { + 'enabled': True + }, + }) + output = vllm_model.generate(prompts, sampling_params) + logprobs = output.outputs[0].logprobs + del vllm_model + torch.npu.empty_cache() + + logprobs_similarity = torch.cosine_similarity( + logprobs_chunked.flatten(), logprobs.flatten(), dim=0) + assert logprobs_similarity > 0.95 diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 2d3428327f..defa7fd3d8 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -39,6 +39,8 @@ def __init__(self, vllm_config): self.expert_tensor_parallel_size = int( additional_config.get("expert_tensor_parallel_size", 0)) self.expert_map_path = additional_config.get("expert_map_path", None) + self.chunked_prefill_for_mla = additional_config.get( + "chunked_prefill_for_mla", False) class TorchairGraphConfig: diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index e07d59a309..43cb71c682 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -11,6 +11,7 @@ from vllm.config import get_current_vllm_config from vllm.model_executor.layers.linear import (LinearBase, UnquantizedLinearMethod) +from vllm.utils import cdiv, round_down from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV @@ -69,6 +70,18 @@ def get_impl_cls() -> Type["MLAAttentionImpl"]: @dataclass class AscendMLAPrefillMetadata: """ Prefill Specific Metadata for Ascend""" + + @dataclass + class ChunkedContextMetadata: + 
# New for MLA (compared to FlashAttention) + # For handling chunked prefill + cu_seq_lens: torch.Tensor + starts: torch.Tensor + seq_tot: list[int] + max_seq_lens: list[int] + workspace: torch.Tensor + chunk_seq_lens: torch.Tensor + attn_mask: torch.Tensor query_lens: list[int] seq_lens: list[int] @@ -78,6 +91,7 @@ class AscendMLAPrefillMetadata: block_table: torch.Tensor max_query_len: int max_seq_lens: int + chunked_context: Optional[ChunkedContextMetadata] = None @dataclass @@ -172,7 +186,32 @@ def __init__(self, if metadata_cls is not None else AscendMLAMetadata # type: ignore self.runner = runner scheduler_config = runner.scheduler_config - self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled + model_config = runner.model_config + self.block_size = runner.block_size + self.chunked_prefill_enabled = runner.chunked_prefill_enabled + if self.chunked_prefill_enabled: + self.chunked_prefill_workspace_size = min( + # Max sure there is enough for 8 full length request or at least + # 4 pages of cache per request + max(8 * model_config.max_model_len, + 4 * scheduler_config.max_num_seqs * self.block_size), + # For long-context models try not to over-allocate limiting + # kv-cache space, limiting it to 64k tokens, + # which would result in the workspace being: + # 2*(576)*(64*1024) = 144mb + # (assuming 576 MLA head dim, and fp16) + # which would result in up-projected context being + # 2*(192*128)*(64*1024) = 3gb + # (assuming 192 QK head dim, 128 heads, and fp16) + 128 * 1024) + assert self.chunked_prefill_workspace_size >= \ + scheduler_config.max_num_seqs * self.block_size + self.chunked_prefill_workspace = torch.empty( + (self.chunked_prefill_workspace_size, + model_config.get_head_size()), + dtype=model_config.dtype, + device=runner.device, + ) ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled @@ -350,6 +389,7 @@ def build( query_start_loc = common_attn_metadata.query_start_loc prefill_metadata = None + chunked_context_metadata = None if self._num_prefills > 0: reqs_start = self._num_decodes # prefill_start tokens_start = self._num_decode_tokens @@ -359,6 +399,41 @@ def build( prefill_query_start_loc = query_start_loc[ reqs_start:] - query_start_loc[reqs_start] + context_lens_cpu = self.runner.input_batch.num_computed_tokens_cpu_tensor[ + reqs_start:num_reqs] + max_context_len_cpu = context_lens_cpu.max().item() + num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item() + if self.chunked_prefill_enabled and max_context_len_cpu > 0: + max_context_chunk = (self.chunked_prefill_workspace_size // + num_prefills_with_context_cpu) + max_context_chunk = round_down(max_context_chunk, + self.block_size) + + assert max_context_chunk > 0 + num_chunks = cdiv(max_context_len_cpu, max_context_chunk) + chunk_starts = torch.arange(num_chunks, dtype=torch.int32) \ + .unsqueeze(1).expand(-1, self._num_prefills) * max_context_chunk + chunk_ends = torch.min(context_lens_cpu.unsqueeze(0), + chunk_starts + max_context_chunk) + chunk_seq_lens = (chunk_ends - chunk_starts).clamp(min=0) + cu_seq_lens_cpu = torch.zeros(num_chunks, + self._num_prefills + 1, + dtype=torch.int32, + pin_memory=True) + torch.cumsum(chunk_seq_lens, + dim=1, + out=cu_seq_lens_cpu[:, 1:], + dtype=torch.int32) + chunked_context_metadata = \ + AscendMLAPrefillMetadata.ChunkedContextMetadata( + cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True), + starts=chunk_starts.to(device, non_blocking=True), + seq_tot=chunk_seq_lens.sum(dim=1).tolist(), + 
max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(), + chunk_seq_lens=chunk_seq_lens, + workspace=self.chunked_prefill_workspace, + ) + prefill_metadata = AscendMLAPrefillMetadata( attn_mask=self.runner.attn_mask, query_lens=query_lens[tokens_start:], @@ -369,6 +444,7 @@ def build( max_query_len=max_query_len, max_seq_lens=max_seq_lens, query_start_loc=prefill_query_start_loc, + chunked_context=chunked_context_metadata, ) decode_metadata = None @@ -575,6 +651,83 @@ def get_and_maybe_dequant_weights(layer: LinearBase): self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data, 29) self.W_UK_T.data = torch_npu.npu_format_cast(self.W_UK_T.data, 29) + def _compute_prefill_context( + self, + query: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + rope_dim: int, + attn_metadata: AscendMLAMetadata, + prefix_output: torch.Tensor, + prefix_lse: torch.Tensor, + ): + prefill_metadata = attn_metadata.prefill + if prefill_metadata is None or prefill_metadata.chunked_context is None: + return prefix_output, prefix_lse + + iters = len(prefill_metadata.chunked_context.seq_tot) + q_pe = query[..., self.qk_nope_head_dim:] + q_nope = query[..., :self.qk_nope_head_dim] + + seq_len1 = torch.tensor(prefill_metadata.query_lens, dtype=torch.int32) + latent_kv_dim = kv_c_and_k_pe_cache.size(3) - rope_dim + cache_kv_c = kv_c_and_k_pe_cache[:, :, :, :latent_kv_dim] + cache_k_pe = kv_c_and_k_pe_cache[:, :, :, latent_kv_dim:] + for i in range(iters): + toks = prefill_metadata.chunked_context.seq_tot[i] + + seq_len2 = prefill_metadata.chunked_context.chunk_seq_lens[i] + seq_len = torch.stack([seq_len1, seq_len2]) + kv_c_normed = torch.empty(toks, + kv_c_and_k_pe_cache.size(2), + latent_kv_dim, + dtype=query.dtype, + device=query.device) + k_pe = torch.empty(toks, + kv_c_and_k_pe_cache.size(2), + rope_dim, + dtype=query.dtype, + device=query.device) + + torch_npu.atb.npu_paged_cache_load( + cache_kv_c, + cache_k_pe, + prefill_metadata.block_table, + seq_len2.to(query.device), + seq_starts=prefill_metadata.chunked_context.starts[i], + key=kv_c_normed, + value=k_pe, + ) + + kv_c_normed = kv_c_normed.squeeze() + kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \ + -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + k_nope, v = kv_nope\ + .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k_pe = k_pe.expand((*k_nope.shape[:-1], -1)) + mask = torch.triu( + torch.ones(512, 512, device=query.device, dtype=query.dtype), + 1) + torch_npu.atb.npu_ring_mla( + q_nope=q_nope, + q_rope=q_pe, + k_nope=k_nope, + k_rope=k_pe, + value=v, + mask=mask, + seqlen=seq_len, + head_num=self.num_heads, + kv_head_num=self.num_heads, + pre_out=prefix_output, + prev_lse=prefix_lse, + qk_scale=self.scale, + kernel_type="kernel_type_high_precision", + mask_type="no_mask", + input_layout="type_bsnd", + calc_type="calc_type_default", + output=prefix_output, + softmax_lse=prefix_lse) + return prefix_output, prefix_lse + def _forward_prefill( self, query: torch.Tensor, @@ -586,19 +739,29 @@ def _forward_prefill( assert attn_metadata.prefill is not None num_tokens = query.size(0) - attn_output = None + attn_output = torch.empty(num_tokens, + self.num_heads, + self.v_head_dim, + dtype=query.dtype, + device=query.device) + k_nope, value = self.kv_b_proj(kv_c_normed)[0].view( + -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim).split( + [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k_pe = k_pe.expand((*k_nope.shape[:-1], -1)) # Here is only 2 possibility of input, ChunkedPrefill or PrefillNoCache + ascend_config = 
get_ascend_config() + if attn_metadata.attn_state in [ AscendAttentionState.ChunkedPrefill, AscendAttentionState.SpecDecoding - ]: - attn_output = torch.empty(num_tokens, - self.num_heads * self.v_head_dim, - dtype=query.dtype, - device=query.device) + ] and not ascend_config.chunked_prefill_for_mla: + attn_output_torch = torch.empty(num_tokens, + self.num_heads * self.v_head_dim, + dtype=query.dtype, + device=query.device) # current requests is chunked in prefill, disable flash attention with chunked prefill vanilla_chunked_prefill_mla( - output=attn_output, + output=attn_output_torch, query=query, kv_cache=kv_c_and_k_pe_cache, block_tables=attn_metadata.prefill.block_table, @@ -613,18 +776,47 @@ def _forward_prefill( scale=self.scale, alibi_slopes=None, causal=True) + elif attn_metadata.attn_state in [ + AscendAttentionState.ChunkedPrefill, + AscendAttentionState.SpecDecoding + ]: + attn_lse = torch.empty(self.num_heads, + num_tokens, + dtype=torch.float32, + device=query.device) + q_pe = query[..., self.qk_nope_head_dim:] + q_nope = query[..., :self.qk_nope_head_dim] + mask = torch.triu( + torch.ones(512, 512, device=query.device, dtype=query.dtype), + 1) # 512: mask only support 512 + if attn_metadata.num_prefills > 1: + mask = mask.unsqueeze(0).repeat(attn_metadata.num_prefills, 1, + 1) + torch_npu.atb.npu_ring_mla( + q_nope=q_nope, + q_rope=q_pe, + k_nope=k_nope, + k_rope=k_pe, + value=value, + mask=mask, + seqlen=torch.tensor(attn_metadata.prefill.query_lens, + dtype=torch.int32), + head_num=self.num_heads, + kv_head_num=self.num_heads, + pre_out=None, + prev_lse=None, + qk_scale=self.scale, + kernel_type="kernel_type_high_precision", + mask_type="mask_type_triu", + input_layout="type_bsnd", + calc_type="calc_type_first_ring", + output=attn_output, + softmax_lse=attn_lse) + attn_output, attn_lse = self._compute_prefill_context( \ + query, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse) + elif attn_metadata.attn_state == AscendAttentionState.PrefillNoCache: - attn_output = torch.empty(num_tokens, - self.num_heads, - self.v_head_dim, - dtype=query.dtype, - device=query.device) - k_nope, value = self.kv_b_proj(kv_c_normed)[0].view( - -1, self.num_heads, - self.qk_nope_head_dim + self.v_head_dim).split( - [self.qk_nope_head_dim, self.v_head_dim], dim=-1) - key = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), - dim=-1) + key = torch.cat((k_nope, k_pe), dim=-1) torch_npu._npu_flash_attention( query=query, key=key, @@ -642,6 +834,11 @@ def _forward_prefill( ) attn_output = attn_output.reshape( [num_tokens, self.num_heads * self.v_head_dim]) + if attn_metadata.attn_state in [ + AscendAttentionState.ChunkedPrefill, + AscendAttentionState.SpecDecoding + ] and not ascend_config.chunked_prefill_for_mla: + attn_output = attn_output_torch current_ms_metadata = get_multistream_comm_context() if current_ms_metadata is None: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index c3587936f9..6d226da0bf 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -134,7 +134,11 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.lora_config = vllm_config.lora_config self.scheduler_config = vllm_config.scheduler_config self.speculative_config = vllm_config.speculative_config - self.chunked_prefill_enabled = vllm_config.scheduler_config.chunked_prefill_enabled + ascend_config = get_ascend_config() + if ascend_config.ascend_scheduler_config.enabled: + 
self.chunked_prefill_enabled = self.scheduler_config.chunked_prefill_enabled + else: + self.chunked_prefill_enabled = True self.device = device self.is_multimodal_model = self.model_config.is_multimodal_model @@ -1260,6 +1264,7 @@ def execute_model( # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. + discard_sampled_tokens_req_indices = [] for i, req_id in enumerate(self.input_batch.req_ids): req_state = self.requests[req_id] seq_len = (req_state.num_computed_tokens + @@ -1270,6 +1275,7 @@ def execute_model( generator = self.input_batch.generators.get(i) if generator is not None: generator.set_offset(generator.get_offset() - 4) + discard_sampled_tokens_req_indices.append(i) # NOTE: NPU -> CPU Sync happens here. # Move as many CPU operations as possible before this sync point. @@ -1290,6 +1296,9 @@ def execute_model( self.input_batch.vocab_size, ) + for i in discard_sampled_tokens_req_indices: + valid_sampled_token_ids[i].clear() + spec_token_ids = self._get_spec_token_ids( valid_sampled_token_ids, sampling_metadata, From 88bb99ddacf74fe50ced9ffa07575980f0a63eab Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Sat, 14 Jun 2025 22:43:22 +0800 Subject: [PATCH 058/153] fix bug for dim of updated_log2phy_map --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 0785ac5fe3..b56468a66c 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -138,4 +138,5 @@ def generate_log2phy_map(self, expert_map): def do_update_log2phy_map(self, layer_id, updated_log2phy_map): if self.log2phy_map_per_layer[layer_id] is not None: - self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map) \ No newline at end of file + rank_id = torch.distributed.get_rank() + self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[rank_id]) \ No newline at end of file From 0d2074a1ececc76b0ac9f9fd37fab399634658c9 Mon Sep 17 00:00:00 2001 From: 22dimensions Date: Sun, 15 Jun 2025 15:41:11 +0800 Subject: [PATCH 059/153] [Doc] fix VLLM_USE_V1 value in graph mode docs (#1226) os.environ["VLLM_USE_V1"] must be assigned with str, not other type. 
![image](https://github.com/user-attachments/assets/9d337ae5-00e5-4179-832e-c6c917dd5798) Signed-off-by: 22dimensions --- docs/source/user_guide/graph_mode.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md index 161b89a0de..38d356249b 100644 --- a/docs/source/user_guide/graph_mode.md +++ b/docs/source/user_guide/graph_mode.md @@ -23,7 +23,7 @@ import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" model = LLM(model="Qwen/Qwen2-7B-Instruct") outputs = model.generate("Hello, how are you?") @@ -45,7 +45,7 @@ offline example: import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" # TorchAirGraph is only work without chunked-prefill now model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}}) @@ -70,7 +70,7 @@ offline example: import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" model = LLM(model="someother_model_weight", enforce_eager=True) outputs = model.generate("Hello, how are you?") From 4270682383b4f4876296565bbb1227c1111b1d5e Mon Sep 17 00:00:00 2001 From: ttanzhiqiang <38750855+ttanzhiqiang@users.noreply.github.com> Date: Sun, 15 Jun 2025 19:57:02 +0800 Subject: [PATCH 060/153] Waiting for BMM NZ support(Improve TPOP 2ms performance) (#1131) ### What this PR does / why we need it? W_UV/W_UK_T cannot be converted to nz, because this position will be fused into transposebatchmatmul, which does not support nz. The weights are actually converted back to nd in each run. ### Does this PR introduce _any_ user-facing change? Use #1098 as the baseline, p90 TPOT 90.79ms->88.58ms, improve TPOP 2ms ### How was this patch tested? use #1101 --------- Signed-off-by: ttanzhiqiang <389825161@qq.com> --- vllm_ascend/attention/mla_v1.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 43cb71c682..f741508791 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -648,8 +648,10 @@ def get_and_maybe_dequant_weights(layer: LinearBase): self.W_UV = W_UV.transpose(0, 1).contiguous() # Convert from (L, N, P) to (N, P, L) self.W_UK_T = W_UK.permute(1, 2, 0).contiguous() - self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data, 29) - self.W_UK_T.data = torch_npu.npu_format_cast(self.W_UK_T.data, 29) + + # Waiting for BMM NZ support + # self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data, 29) + # self.W_UK_T.data = torch_npu.npu_format_cast(self.W_UK_T.data, 29) def _compute_prefill_context( self, From 4ce860a2be91e7aa4a301a4c25fe88aa3c71be07 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Sun, 15 Jun 2025 22:07:43 +0800 Subject: [PATCH 061/153] [CI] Make e2e test to be preemptible and simple (#1217) ### What this PR does / why we need it? This PR make e2e test to be simple, even bring some repeat code between single card and multicard, but we will not struggle with across max-parallel, matrix and concurrency: 1. This PR make e2e test to be preemptible and simple: - lint ---> e2e (2 parallel) ---> e2e multi-card (1 parallel) - Anytime you push another PR will cancel previous job, whatever the job is lint / e2e / multi-cards 2. Use Modelscope rather than hf-mirror 3. 
Resolve some error like `Canceling since a higher priority waiting request for pr-XXXX-limit-npu-4 exists` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed - lint ---> e2e (2 parallel) ---> e2e multi-card (1 parallel) - e2e test will canceled by update patch --------- Signed-off-by: Yikun Jiang --- .github/workflows/vllm_ascend_test.yaml | 183 ++++++++++++++++-------- 1 file changed, 126 insertions(+), 57 deletions(-) diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index f71e83205f..de085e4850 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. # -name: 'test' +name: 'e2e test / basic' on: schedule: @@ -44,6 +44,12 @@ defaults: run: shell: bash -el {0} +# only cancel in-progress runs of the same workflow +# and ignore the lint / 1 card / 4 cards test type +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: lint: runs-on: ubuntu-latest @@ -114,25 +120,14 @@ jobs: strategy: max-parallel: 2 matrix: - os: [linux-arm64-npu-1, linux-arm64-npu-4] + os: [linux-arm64-npu-1] vllm_version: [main, v0.9.1] - concurrency: - group: > - ${{ - matrix.os == 'linux-arm64-npu-4' - && github.event.pull_request.number - && format('pr-{0}-limit-npu-4', github.event.pull_request.number) - || format('job-{0}-{1}-{2}', matrix.os, matrix.vllm_version, github.event.pull_request.number) - }} - cancel-in-progress: false name: vLLM Ascend test runs-on: ${{ matrix.os }} container: # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 env: - HF_ENDPOINT: https://hf-mirror.com - HF_TOKEN: ${{ secrets.HF_TOKEN }} VLLM_LOGGING_LEVEL: ERROR steps: - name: Check npu and CANN info @@ -177,61 +172,135 @@ jobs: env: VLLM_USE_V1: 1 VLLM_WORKER_MULTIPROC_METHOD: spawn + VLLM_USE_MODELSCOPE: True + run: | + pytest -sv tests/singlecard/test_offline_inference.py + # TODO: switch hf to modelscope + VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ + pytest -sv tests/singlecard/test_ilama_lora.py + # TODO(sss): guided decoding doesn't work, fix it later + # pytest -sv tests/singlecard/test_guided_decoding.py + # test_ascend_config.py should be ran separately because it will regenerate the global config many times. + pytest -sv tests/singlecard/test_ascend_config.py + pytest -sv tests/singlecard/test_camem.py + pytest -sv tests/singlecard/ \ + --ignore=tests/singlecard/test_offline_inference.py \ + --ignore=tests/singlecard/test_ilama_lora.py \ + --ignore=tests/singlecard/test_guided_decoding.py \ + --ignore=tests/singlecard/test_ascend_config.py \ + --ignore=tests/singlecard/test_camem.py + + - name: Run vllm-project/vllm-ascend test on V0 engine + if: ${{ github.event_name == 'schedule' }} + env: + VLLM_USE_V1: 0 + VLLM_USE_MODELSCOPE: True run: | - if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then - VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py - # guided decoding doesn't work, fix it later - # pytest -sv tests/singlecard/test_guided_decoding.py.py - # test_ascend_config.py should be ran separately because it will regenerate the global config many times. 
- pytest -sv tests/singlecard/test_ascend_config.py - pytest -sv tests/singlecard/test_camem.py - pytest -sv tests/singlecard/core/test_ascend_scheduler.py - pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py - pytest -sv tests/singlecard/ \ + pytest -sv tests/singlecard/test_offline_inference.py + # TODO: switch hf to modelscope + VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ + pytest -sv tests/singlecard/test_ilama_lora.py + # guided decoding doesn't work, fix it later + # pytest -sv tests/singlecard/test_guided_decoding.py + pytest -sv tests/singlecard/test_camem.py + # test_ascend_config.py should be ran separately because it will regenerate the global config many times. + pytest -sv tests/singlecard/test_ascend_config.py + pytest -sv tests/singlecard/test_prompt_embedding.py + pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ + --ignore=tests/singlecard/test_ilama_lora.py \ --ignore=tests/singlecard/test_guided_decoding.py \ - --ignore=tests/singlecard/test_ascend_config.py \ --ignore=tests/singlecard/test_camem.py \ + --ignore=tests/singlecard/test_ascend_config.py \ + --ignore=tests/singlecard/test_prompt_embedding.py \ --ignore=tests/singlecard/core/test_ascend_scheduler.py \ --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py - else + + e2e-4-cards: + needs: [e2e] + if: ${{ needs.e2e.result == 'success' }} + strategy: + max-parallel: 1 + matrix: + os: [linux-arm64-npu-4] + vllm_version: [main, v0.9.1] + name: vLLM Ascend test + runs-on: ${{ matrix.os }} + container: + # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready + image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 + env: + VLLM_LOGGING_LEVEL: ERROR + VLLM_USE_MODELSCOPE: True + steps: + - name: Check npu and CANN info + run: | + npu-smi info + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + + - name: Config mirrors + run: | + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + apt-get update -y + apt install git -y + git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ + + - name: Checkout vllm-project/vllm-ascend repo + uses: actions/checkout@v4 + + - name: Install system dependencies + run: | + apt-get -y install `cat packages.txt` + apt-get -y install gcc g++ cmake libnuma-dev + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + ref: ${{ matrix.vllm_version }} + path: ./vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: ./vllm-empty + run: | + VLLM_TARGET_DEVICE=empty pip install -e . + + - name: Install vllm-project/vllm-ascend + run: | + pip install -r requirements-dev.txt + pip install -v -e . + + - name: Run vllm-project/vllm-ascend test for V1 Engine + env: + VLLM_USE_V1: 1 + VLLM_WORKER_MULTIPROC_METHOD: spawn + VLLM_USE_MODELSCOPE: True + run: | + # TODO: switch hf to modelscope + VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ pytest -sv tests/multicard/test_ilama_lora_tp2.py - # To avoid oom, we need to run the test in a single process. 
- VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek - VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk - VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py - fi + # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. + # To avoid oom, we need to run the test in a single process. + pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ + pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek + pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 + pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py - name: Run vllm-project/vllm-ascend test on V0 engine if: ${{ github.event_name == 'schedule' }} env: VLLM_USE_V1: 0 + VLLM_USE_MODELSCOPE: True run: | - if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then - VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py - # guided decoding doesn't work, fix it later - # pytest -sv tests/singlecard/test_guided_decoding.py.py - pytest -sv tests/singlecard/test_camem.py - # test_ascend_config.py should be ran separately because it will regenerate the global config many times. - pytest -sv tests/singlecard/test_ascend_config.py - pytest -sv tests/singlecard/test_prompt_embedding.py - pytest -sv tests/singlecard/ \ - --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_guided_decoding.py \ - --ignore=tests/singlecard/test_camem.py \ - --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_prompt_embedding.py \ - --ignore=tests/singlecard/core/test_ascend_scheduler.py \ - --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py - else + # TODO: switch hf to modelscope + VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ pytest -sv tests/multicard/test_ilama_lora_tp2.py - # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. - # To avoid oom, we need to run the test in a single process. 
- VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek - VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk - VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py - fi + # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. + # To avoid oom, we need to run the test in a single process. + pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ + pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek + pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 + pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py From 966557a2a3d83704a115f1465017d4f0c94af6cc Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Mon, 16 Jun 2025 09:02:53 +0800 Subject: [PATCH 062/153] [Build] Speedup image build (#1216) ### What this PR does / why we need it? 1. Rename workflow name to show OS info 2. Speedup image build: - PR: only arm64 build on openEuler arm64, only amd64 build on Ubuntu amd64 - Push/Tag: still keep origin logic use qemu on amd64 This PR actually drop the e2e image build per PR but I think it's fine consider it's stable enough, if we still meet some problem we can revert this PR 43-44mins ---> about 8-10 mins ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed Signed-off-by: Yikun Jiang --- .github/workflows/image_openeuler.yml | 19 ++++++++++++++----- .github/workflows/image_ubuntu.yml | 11 ++++++++--- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/.github/workflows/image_openeuler.yml b/.github/workflows/image_openeuler.yml index 2421aa4b11..0d585e690f 100644 --- a/.github/workflows/image_openeuler.yml +++ b/.github/workflows/image_openeuler.yml @@ -1,4 +1,4 @@ -name: 'image' +name: 'image / openEuler' # This is a docker build check and publish job: # 1. PR Triggered docker image build check # - is for image build check @@ -33,9 +33,13 @@ on: jobs: build: - name: vllm-ascend openEuler image - runs-on: ubuntu-latest - + name: vllm-ascend image build + runs-on: >- + ${{ + github.event_name == 'push' && github.repository_owner == 'vllm-project' && + 'ubuntu-latest' || + 'ubuntu-24.04-arm' + }} steps: - uses: actions/checkout@v4 @@ -87,7 +91,12 @@ jobs: - name: Build and push uses: docker/build-push-action@v6 with: - platforms: linux/amd64,linux/arm64 + platforms: >- + ${{ + github.event_name == 'push' && github.repository_owner == 'vllm-project' && + 'linux/amd64,linux/arm64' || + 'linux/arm64' + }} # use the current repo path as the build context, ensure .git is contained context: . 
# only trigger when tag, branch/main push diff --git a/.github/workflows/image_ubuntu.yml b/.github/workflows/image_ubuntu.yml index 5cdf076839..63dadc64a7 100644 --- a/.github/workflows/image_ubuntu.yml +++ b/.github/workflows/image_ubuntu.yml @@ -1,4 +1,4 @@ -name: 'image' +name: 'image / Ubuntu' # This is a docker build check and publish job: # 1. PR Triggered docker image build check # - is for image build check @@ -33,7 +33,7 @@ on: jobs: build: - name: vllm-ascend Ubuntu image + name: vllm-ascend image build runs-on: ubuntu-latest steps: @@ -87,7 +87,12 @@ jobs: - name: Build and push uses: docker/build-push-action@v6 with: - platforms: linux/amd64,linux/arm64 + platforms: >- + ${{ + github.event_name == 'push' && github.repository_owner == 'vllm-project' && + 'linux/amd64,linux/arm64' || + 'linux/amd64' + }} # use the current repo path as the build context, ensure .git is contained context: . # only trigger when tag, branch/main push From e86964e87ed13a39312f827a0628c4862d850bf3 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Mon, 16 Jun 2025 10:15:53 +0800 Subject: [PATCH 063/153] add dynamic_ep alg. --- vllm_ascend/eplb/core/policy/dynamic_ep.py | 337 ++++++++++++++++++ .../eplb/core/policy/mock_load_balance.py | 3 +- .../eplb/core/policy/policy_factory.py | 4 +- vllm_ascend/eplb/core/worker/eplb_worker.py | 284 +++++++++++++++ 4 files changed, 625 insertions(+), 3 deletions(-) create mode 100644 vllm_ascend/eplb/core/policy/dynamic_ep.py diff --git a/vllm_ascend/eplb/core/policy/dynamic_ep.py b/vllm_ascend/eplb/core/policy/dynamic_ep.py new file mode 100644 index 0000000000..9023e09d43 --- /dev/null +++ b/vllm_ascend/eplb/core/policy/dynamic_ep.py @@ -0,0 +1,337 @@ +# Copyright Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. +from collections import defaultdict +import numpy as np + +from .eplb_policy import EplbPolicy, DynamicConfig + + +class DynamicTable: + # workload_table: + # 三维矩阵,[layer, gpus, experts_per_gpu_per_layer] -> value: 所在位置的热度 + # 大小为 层数 * 卡数 * 每层每卡的专家数量 + # 里面i, j, k的元素代表 第 i 层 第 j 张卡第 k 个专家的热度 + # 对于收集不到的专家,填为 -1 + workload_table = None + + # placement_table: + # 三维矩阵,[layer, gpus, experts_per_gpu_per_layer] -> value: 所在位置的物理专家id + # 大小为 层数 * 卡数 * 每层每卡的专家数量 + # 里面i, j, k的元素代表 第 i 层 第 j 张卡第 k 个专家的物理id + # 对于收集不到的专家,填为 -1 + placement_table = None + + +class DynamicEP(EplbPolicy): + + def __init__(self, config: DynamicConfig): + super().__init__(config) + + @staticmethod + def add_redundant(current_expert_table, expert_workload, num_original_expert): + layer_num, npu_num, experts_per_npu = expert_workload.shape + workload_new = np.zeros((layer_num, num_original_expert)) + for layer_idx in range(layer_num): + workload_dict = defaultdict(int) + placement_layer = current_expert_table[layer_idx].copy() + workload_layer = expert_workload[layer_idx].copy() + for npu_idx in range(npu_num): + for expert_idx in range(experts_per_npu): + workload_dict[placement_layer[npu_idx][expert_idx]] += workload_layer[npu_idx][expert_idx] + for expert_idx in range(num_original_expert): + workload_new[layer_idx][expert_idx] = workload_dict[expert_idx] + return workload_new + + @staticmethod + # 热点专家拆分为冗余专家 + def original_compute_balanced_pack_redundancy(origin_weights, card_num, num_redundancy_expert): + # Step 1: Sort the items by weight in descending order (we are sorting by weight now) + # Sort based on the second element (the second value of each tuple) + route_expert_num = len(origin_weights) + route_expert_redundancy = [[] for _ in range(route_expert_num)] + for i in 
range(num_redundancy_expert): + sorted_indices = np.argsort([t[1] for t in origin_weights], kind='stable')[::-1] + weights = [origin_weights[idx] for idx in sorted_indices] + tmp_raw_weight = weights[0][1] * (len(route_expert_redundancy[weights[0][0]]) + 1) + route_expert_redundancy[weights[0][0]].append(route_expert_num + i) + avg_weight = tmp_raw_weight / (len(route_expert_redundancy[weights[0][0]]) + 1) + weights[0] = (weights[0][0], avg_weight) + origin_weights = weights + + # Step 2: Calculate the number of items per box + expert_num = route_expert_num + num_redundancy_expert + items_per_box = expert_num // card_num # Number of items per box + remaining_items = expert_num % card_num # Number of items per box + + # Step 3: Initialize card_num boxes with empty lists to store item IDs + boxes = [[] for _ in range(card_num)] + boxes_weights = [[] for _ in range(card_num)] + box_weights = [0] * card_num # To store the total weight of each box + box_counts = [0] * card_num # To store the number of items in each box + index = 0 + for i in range(route_expert_num): + redundancy_num = len(route_expert_redundancy[i]) + for _ in range(redundancy_num): + cur_weight = 0 + for item, weight in origin_weights: + if item == i: + cur_weight = weight + + boxes[index].append(i) + boxes_weights[index].append(cur_weight) + box_weights[index] += cur_weight + box_counts[index] += 1 + index += 1 + + sorted_indices = np.argsort([t[1] for t in origin_weights], kind='stable')[::-1] + origin_weights = [origin_weights[idx] for idx in sorted_indices] + # Step 4: Distribute items into boxes based on weight + for item_id, weight in origin_weights: + # Find the box with the least items but not full + min_box_index = -1 + for i in range(card_num): + if item_id in boxes[i]: + continue + # Only choose boxes that still have space (box_counts[i] < items_per_box) + if box_counts[i] < items_per_box or (box_counts[i] == items_per_box and remaining_items > 0): + if min_box_index == -1 or box_weights[i] < box_weights[min_box_index]: + min_box_index = i + + # Place the item (id) into the selected box + boxes[min_box_index].append(item_id) + boxes_weights[min_box_index].append(weight) + box_weights[min_box_index] += weight + box_counts[min_box_index] += 1 + + # If there's an imbalance in the remaining items, reduce the "remaining_items" counter + if box_counts[min_box_index] == (items_per_box + 1) and remaining_items > 0: + remaining_items -= 1 + + # Step 5: Output each box's contents and total weight + result = [] + for i in range(card_num): + result.append({ + "box_index": i + 1, + "items": boxes[i], # List of item IDs in the box + "weight": boxes_weights[i], + "total_weight": box_weights[i], # Total weight in this box + "item_count": box_counts[i] # Number of items in the box + }) + + return result, boxes + + # 热点专家拆分为冗余专家 + @staticmethod + def compute_balanced_pack_redundancy(origin_weights, card_num, num_redundancy_expert): + route_expert_num = len(origin_weights) + route_expert_redundancy = [[] for _ in range(route_expert_num)] + for i in range(num_redundancy_expert): + sorted_indices = np.argsort([t[1] for t in origin_weights], kind='stable')[::-1] + weights = [origin_weights[idx] for idx in sorted_indices] + tmp_raw_weight = weights[0][1] * (len(route_expert_redundancy[weights[0][0]]) + 1) + route_expert_redundancy[weights[0][0]].append(route_expert_num + i) + avg_weight = tmp_raw_weight / (len(route_expert_redundancy[weights[0][0]]) + 1) + weights[0] = (weights[0][0], avg_weight) + origin_weights = weights + + expert_num 
= route_expert_num + num_redundancy_expert + if card_num == 0: + raise RuntimeError("card_num can not be 0.") + items_per_box = expert_num // card_num + remaining_items = expert_num % card_num + + boxes = [[] for _ in range(card_num)] + boxes_weights = [[] for _ in range(card_num)] + box_weights = [0] * card_num + box_counts = [0] * card_num + + all_weights = np.zeros((expert_num,), dtype='object') + all_weights[: route_expert_num] = origin_weights + + index = route_expert_num + for i in range(route_expert_num): + redundancy_num = len(route_expert_redundancy[i]) + for _ in range(redundancy_num): + for item, weight in origin_weights: + if item == i: + all_weights[index] = (item, weight) + index += 1 + + sorted_indices = np.argsort([t[1] for t in all_weights], kind='stable')[::-1] + all_weights = [all_weights[idx] for idx in sorted_indices] + for item_id, weight in all_weights: + min_box_index = -1 + for i in range(card_num): + if box_counts[i] < items_per_box or (box_counts[i] == items_per_box and remaining_items > 0): + if min_box_index == -1 or box_weights[i] < box_weights[min_box_index]: + if item_id not in boxes[i]: + min_box_index = i + + boxes[min_box_index].append(item_id) + boxes_weights[min_box_index].append(weight) + box_weights[min_box_index] += weight + box_counts[min_box_index] += 1 + + if box_counts[min_box_index] == (items_per_box + 1) and remaining_items > 0: + remaining_items -= 1 + + result = [] + for i in range(card_num): + result.append({ + "box_index": i + 1, + "items": boxes[i], + "weight": boxes_weights[i], + "total_weight": box_weights[i], + "item_count": box_counts[i] + }) + + return result, boxes + + # 无冗余专家方案 + @staticmethod + def compute_balanced_pack(origin_weights, card_num): + sorted_indices = np.argsort([t[1] for t in origin_weights])[::-1] + weights = origin_weights[sorted_indices] + expert_num = len(weights) + if card_num == 0: + raise RuntimeError("card_num can not be 0.") + items_per_box = expert_num // card_num + remaining_items = expert_num % card_num + + boxes = [[] for _ in range(card_num)] + boxes_weights = [[] for _ in range(card_num)] + box_weights = [0] * card_num + box_counts = [0] * card_num + + for item_id, weight in weights: + min_box_index = -1 + for i in range(card_num): + if box_counts[i] < items_per_box or (box_counts[i] == items_per_box and remaining_items > 0): + if min_box_index == -1 or box_weights[i] < box_weights[min_box_index]: + min_box_index = i + + boxes[min_box_index].append(item_id) + boxes_weights[min_box_index].append(weight) + box_weights[min_box_index] += weight + box_counts[min_box_index] += 1 + + if box_counts[min_box_index] == (items_per_box + 1) and remaining_items > 0: + remaining_items -= 1 + + result = [] + for i in range(card_num): + result.append({ + "box_index": i + 1, + "items": boxes[i], + "weight": boxes_weights[i], + "total_weight": box_weights[i], + "item_count": box_counts[i] + }) + + return result, boxes + + @staticmethod + def get_redundant_num(npu_num, counts): + redundant_num_each_npu = np.sum(counts - 1) + return redundant_num_each_npu + + @staticmethod + def calculate_max_heat_per_layer(workload_table, layer_num): + max_heat_per_layer = [] + for layer_idx in range(layer_num): + npu_heats_now = np.sum(workload_table[layer_idx], axis=1) + max_heat_per_layer.append(np.max(npu_heats_now)) + return max_heat_per_layer + + @staticmethod + def constraint_expert_local_exchange(current_expert_table, global_deployment): + for layer_id in range(len(global_deployment)): + for card_id in 
range(len(global_deployment[layer_id])): + current_list = [int(x) for x in current_expert_table[layer_id][card_id]] + new_list = [int(x) for x in global_deployment[layer_id][card_id]] + num = len(new_list) + + new_index = [-1] * num + new_result = [-1] * num + remaining_elements = [] + + for i in range(num): + flag = True + for j in range(num): + if new_list[i] == current_list[j] and new_index[j] == -1: + new_index[j] = 0 + new_result[j] = current_list[j] + flag = False + break + if flag: + remaining_elements.append(new_list[i]) + + index = 0 + for k in range(num): + if new_result[k] == -1: + new_result[k] = remaining_elements[index] + index += 1 + + global_deployment[layer_id][card_id] = new_result + + return global_deployment + + + def rebalance_experts(self, current_expert_table, expert_workload): + + info = DynamicTable() + info.workload_table = np.array(expert_workload) + info.placement_table = np.array(current_expert_table) + layer_num, num_npus, experts_per_npu= info.workload_table.shape + expert_ids, counts = np.unique(info.placement_table[0], return_counts=True) + num_redundancy_expert = self.get_redundant_num(num_npus, counts) + num_original_expert = len(expert_ids) + layer_workloads = self.add_redundant(info.placement_table, info.workload_table, num_original_expert) + max_heat_per_layer_before = self.calculate_max_heat_per_layer(info.workload_table, layer_num) + npu_heat_all_origin = sum(max_heat_per_layer_before) + + # 计算负载均衡,部署冗余专家 + layer_num = layer_workloads.shape[0] + expert_num = layer_workloads.shape[1] + # 校验专家数量、卡数量、冗余专家数量不能超过卡数量 + if num_original_expert != expert_num: + raise ValueError(f"原始专家数量 {num_original_expert} 必须等于 expert_num {expert_num}") + + if num_npus <= 0: + raise ValueError("NPUs 数量必须大于 0") + + if num_npus < num_redundancy_expert: + raise ValueError(f"NPUs 数量 {num_npus} 必须大于或等于冗余专家数量 {num_redundancy_expert}") + + # 每个卡部署的专家数量 一个冗余专家 + global_deployment = [[[] for _ in range(num_npus)] for _ in range(layer_num)] + # 遍历获得每一层的放置策略,考虑计算均衡 + max_heat_per_layer_after = np.zeros([layer_num]) + for layer in range(layer_num): + # 获取当前层专家ID和对应负载,负载需要进行正则化处理, 每个卡加一个冗余专家 + weights = np.zeros((expert_num,), dtype='object') + for expert_id, workload_weight in enumerate(layer_workloads[layer]): + weights[expert_id] = (expert_id, workload_weight) + + # 获取每一层全局计算均衡的放置策略 + result, layer_deployment = self.original_compute_balanced_pack_redundancy( + weights, num_npus, num_redundancy_expert + ) + + global_deployment[layer] = layer_deployment + max_heat_per_layer_after[layer] = max(result, key=lambda x: x['total_weight'])['total_weight'] + + new_global_deployment = self.constraint_expert_local_exchange(current_expert_table, global_deployment) + # 获取层优先级 + layer_changed_ratio = [] + for layer_idx in range(layer_num): + layer_changed_ratio.append(max_heat_per_layer_after[layer_idx] / max_heat_per_layer_before[layer_idx]) + + per_layer_priority = np.argsort(layer_changed_ratio) + npu_heat_all_after = sum(max_heat_per_layer_after) + + change = 0 + if npu_heat_all_after < 0.95 * npu_heat_all_origin: + change = 1 + + return change, per_layer_priority, np.array(new_global_deployment).tolist() + diff --git a/vllm_ascend/eplb/core/policy/mock_load_balance.py b/vllm_ascend/eplb/core/policy/mock_load_balance.py index 0ba2ca21ba..6626d3fb5c 100644 --- a/vllm_ascend/eplb/core/policy/mock_load_balance.py +++ b/vllm_ascend/eplb/core/policy/mock_load_balance.py @@ -19,7 +19,8 @@ def rebalance_experts(self, current_expert_table, expert_workload): for i in range(num_layers): # 随机选两个卡 - 
indices = random.sample(range(num_card), 2) + # indices = random.sample(range(num_card), 2) + indices = [3,1] # 交换冗余专家 expert_id_to_exchange = new_table[i][indices[0]][-1].clone() diff --git a/vllm_ascend/eplb/core/policy/policy_factory.py b/vllm_ascend/eplb/core/policy/policy_factory.py index 726d66ff86..0aa727bd99 100644 --- a/vllm_ascend/eplb/core/policy/policy_factory.py +++ b/vllm_ascend/eplb/core/policy/policy_factory.py @@ -1,7 +1,7 @@ # Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. from .eplb_policy import EplbPolicy, DynamicConfig from .mock_load_balance import MockLoadBalance -#from .mock_dynamic_ep import DynamicEP +from .dynamic_ep import DynamicEP @@ -10,6 +10,6 @@ class PolicyFactory: def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy: policy = { 0:MockLoadBalance , # MockLoadBalance - #1:DynamicEP, # When real eplb algorithm is ready, recover this + 1:DynamicEP, # When real eplb algorithm is ready, recover this } return policy.get(policy_type, MockLoadBalance)(config) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index e69de29bb2..4fbf007426 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -0,0 +1,284 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# + +import time +import numpy as np +import torch +import torch_npu +import logging +import torch.distributed as dist +from multiprocessing import Process, Queue, Manager +from abc import ABC, abstractmethod +from vllm.logger import logger + +from vllm_ascend.eplb.core.policy.policy_factory import PolicyFactory, DynamicConfig + + +class EplbWorker: + + def __init__(self, shared_dict, policy_type, enable_d2d: bool = True): + self.policy_type = policy_type + self.policy = PolicyFactory.generate_policy(policy_type, DynamicConfig()) + self.shared_dict = shared_dict + self.old_expert_maps = None + self.enable_d2d = enable_d2d + + def do_update(self): + # put data in to queue + # in process self.policy.generate_policy() + # get epxert table && tensor + + # async stream + # D2D + # H2D + + # Get initial expert_map + if self.old_expert_maps is None: + self.old_expert_maps = self.get_init_expert_maps() + self.num_local_experts = self.old_expert_maps.max() + 1 + + # Get MOE load information + load_info = self.fetch_and_sum_load_info() + if load_info is None: + return + + #根据负载信息,获取更新后的专家表 + load_info, old_placement = self.global2local(load_info, self.old_expert_maps, self.num_local_experts) + changed, priority, new_placement = self.calculate_rebalance_experts(load_info, old_placement) + + if not torch.is_tensor(new_placement): + new_placement = torch.tensor(new_placement) + self.check_expert_placement(old_placement, new_placement) + new_expert_maps = self.local2global(new_placement) + + logger.debug(f"[EPLB Process new_map differs, performing D2D") + + update_info = self.compose_expert_update_info(new_expert_maps, self.old_expert_maps) + self.old_expert_maps = new_expert_maps + logger.info("EPLB Process complete") + + return update_info + + def check_expert_placement(self, old_placement, new_placement): + num_layers = old_placement.shape[0] + num_ranks = old_placement.shape[1] + + for layer_id in range(num_layers): + for rank_id in range(num_ranks): + new_placement_check = new_placement[layer_id][rank_id] + old_placement_check = old_placement[layer_id][rank_id] + + # check if same logical experts are placed on the same NPU + if new_placement_check.numel() != torch.unique(new_placement_check).numel(): + logger.error(f"Replicated experts are placed on the same NPU, expert placement on layer {layer_id}, rank {rank_id} is invalid") + new_placement[layer_id] = old_placement[layer_id] + break + + # check if there is any experts movement inside one NPU + expert_not_move = torch.isin(new_placement_check, old_placement_check) + if not torch.equal(new_placement_check[expert_not_move], old_placement_check[expert_not_move]): + logger.error(f"There exists expert movement inside NPU, expert placement on layer {layer_id}, rank {rank_id} is invalid") + new_placement[layer_id] = old_placement[layer_id] + break + + # TODO: Here only expert weight exchange is considered, need to be extended to cover other weight update cases + def compose_expert_update_info(self, updated_expert_maps, current_expert_maps): + num_layers = current_expert_maps.shape[0] + num_ranks = current_expert_maps.shape[1] + num_experts = current_expert_maps.shape[2] + + for layer_id in range(num_layers): + updated_expert_maps_this_layer = updated_expert_maps[layer_id] + current_expert_maps_this_layer = current_expert_maps[layer_id] + + expert_send_info_this_layer = dict() + expert_recv_info_this_layer = dict() + + # Guard Clause: if there is no expert weight update, avoid subsequent processing + if torch.equal(updated_expert_maps_this_layer, 
current_expert_maps_this_layer): + yield (expert_send_info_this_layer, expert_recv_info_this_layer, updated_expert_maps_this_layer, layer_id) + + # Parse expert_ids each rank needs to receive from other ranks + dst_rank_indices, experts_to_recv = torch.where((current_expert_maps_this_layer == -1) \ + & (updated_expert_maps_this_layer != -1)) + + # Parse expert_ids each rank needs to send to other ranks + src_rank_indices, experts_to_send = torch.where((current_expert_maps_this_layer != -1) \ + & (updated_expert_maps_this_layer == -1)) + + for idx in range(len(dst_rank_indices)): + dst_rank_id = dst_rank_indices[idx].item() + expert_id = experts_to_recv[idx].item() + if dst_rank_id not in expert_recv_info_this_layer: + expert_recv_info_this_layer[dst_rank_id] = [] + + if not torch.isin(torch.tensor(expert_id), experts_to_send).any(): + # if expert_id are not sent out from any npu, it will be copied from one npu holding this expert + candidate_src_rank_indices = torch.where(current_expert_maps_this_layer[:, expert_id] != -1) + else: + candidate_src_rank_indices = src_rank_indices[experts_to_send == expert_id] + + #TODO: improve selection criterion of npu sending expert_id considering such as intra-node or inter-node... + src_rank_id = candidate_src_rank_indices[0].item() + if src_rank_id not in expert_send_info_this_layer: + expert_send_info_this_layer[src_rank_id] = [] + + expert_send_info_this_layer[src_rank_id].append((dst_rank_id, expert_id)) + expert_recv_info_this_layer[dst_rank_id].append((src_rank_id, expert_id)) + + yield (expert_send_info_this_layer, expert_recv_info_this_layer, updated_expert_maps_this_layer, layer_id) + + + def calculate_rebalance_experts(self, load_info, old_placement): + """ + 通过 policy 实例的 rebalance_experts 方法计算 new_map。 + """ + if self.old_expert_maps is None: + return False, None, None + + changed, priority, new_map = self.policy.rebalance_experts(old_placement, load_info) + return changed, priority, new_map + + def get_init_expert_maps(self): + """ + Read the initial expert_map from shared_dict. + """ + return self.shared_dict.get("expert_maps", None) + + def fetch_and_sum_load_info(self): + """ + Each time the subprocess is awakened, read the latest moe_load + (shape: [num_moe_layers, num_experts_per_layer]) from shared_dict. 
+ """ + return self.shared_dict.get("moe_load", None) + + def update_expert_map(self, expert_maps): + + self.shared_dict["expert_maps"] = expert_maps + + def global2local(self, + workload: torch.Tensor, + placement: torch.Tensor, + E_local: int + ) -> tuple[torch.Tensor, torch.Tensor]: + + L, G, _ = placement.shape + device = placement.device + + wt_local = torch.full((L, G, E_local), + fill_value=-1, + dtype=workload.dtype, + device=device) + pt_local = torch.full((L, G, E_local), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement >= 0 + l_idx, g_idx, k_idx = valid.nonzero(as_tuple=True) + + slot_idx = placement[l_idx, g_idx, k_idx] + values = workload[l_idx, g_idx, k_idx] + + wt_local[l_idx, g_idx, slot_idx] = values + pt_local[l_idx, g_idx, slot_idx] = k_idx + + return wt_local, pt_local + + + def local2global(self, + placement_local: torch.Tensor + ) -> torch.Tensor: + + L, G, E_local = placement_local.shape + device = placement_local.device + + max_id = torch.max(placement_local) + E_global = (max_id + 1).item() if max_id >= 0 else 0 + + if E_global == 0: + return torch.empty((L, G, 0), dtype=torch.long, device=device) + + placement_global = torch.full((L, G, E_global), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement_local >= 0 + l_idx, g_idx, slot_idx = valid.nonzero(as_tuple=True) + gid_idx = placement_local[l_idx, g_idx, slot_idx] + + placement_global[l_idx, g_idx, gid_idx] = slot_idx + + return placement_global + + +class EplbProcess: + def __init__(self, shared_dict, planner_q, block_update_q, policy_type: int = 0, enable_d2d: bool = True): + """ + Args: + shared_dict: Cross-process shared dict returned by Manager().dict() + policy_type: Integer passed to PolicyFactory.generate_policy + enable_d2d: Whether to enable D2D loading + """ + self.shared_dict = shared_dict + self.policy_type = policy_type + self.enable_d2d = enable_d2d + self.planner_q = planner_q + self.block_update_q = block_update_q + + # Create EplbWorker instance + self.worker = EplbWorker(self.shared_dict, self.policy_type, self.enable_d2d) + + + def worker_process(self,planner_q,block_update_q): + """ + Subprocess entry: bind to specified NPU, loop waiting for planner_q to wake up, call do_update, then notify main process update is complete. + """ + while True: + try: + + planner_q.get() + + update_info = self.worker.do_update() + + logger.info("update_info:", update_info) + + for (a,b,c,d) in update_info: + while True: + if not block_update_q.empty(): + continue + block_update_q.put((a,b,c,d)) + break + + except Exception as e: + logger.warning(f"[EPLB subprocess Exiting due to error: {e}", exc_info=True) + break + + def _launch_process(self): + """ + Use spawn method to launch subprocess and return (planner_q, block_update_q, proc). 
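The `global2local` / `local2global` pair above only scatters indices between the two layouts; a toy round trip (invented shapes, for illustration only) shows the invariant they maintain:

```python
# Illustrative only: 1 MoE layer, 2 ranks, 4 global experts, 2 local slots.
import torch

# Global view: placement[layer, rank, global_expert_id] = local slot, or -1.
placement = torch.tensor([[[ 0,  1, -1, -1],
                           [-1, -1,  0,  1]]])
E_local = 2

# global2local: put each held global expert id into its local slot.
pt_local = torch.full((1, 2, E_local), -1, dtype=torch.long)
l, g, k = (placement >= 0).nonzero(as_tuple=True)
pt_local[l, g, placement[l, g, k]] = k
print(pt_local.tolist())          # [[[0, 1], [2, 3]]]  (slot -> global expert id)

# local2global: invert the mapping back to the global layout.
restored = torch.full((1, 2, 4), -1, dtype=torch.long)
l, g, s = (pt_local >= 0).nonzero(as_tuple=True)
restored[l, g, pt_local[l, g, s]] = s
print(torch.equal(restored, placement))   # True
```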
+ """ + proc = Process( + target=self.worker_process, + args=(self.planner_q,self.block_update_q), + daemon=True + ) + + proc.start() + return proc + From 9d1893a2762cec3e929768cb187e346570812796 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Mon, 16 Jun 2025 10:18:39 +0800 Subject: [PATCH 064/153] fxi bugs --- vllm_ascend/worker/model_runner_v1.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 56c8a6b7fa..9c010be90b 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1340,9 +1340,6 @@ def execute_model( if self.dynamic_eplb: self.eplb_updator.forward_end() - if self.dynamic_eplb: - self.eplb_updator.forward_end() - return model_runner_output def _profile_multimodal(self) -> None: From 0992bee99f90a614ca50efed8a13af7bead7a353 Mon Sep 17 00:00:00 2001 From: raindaywhu Date: Mon, 16 Jun 2025 15:25:46 +0800 Subject: [PATCH 065/153] fix eplb update log --- vllm_ascend/eplb/core/loader/device_transfer_loader.py | 2 ++ vllm_ascend/eplb/core/worker/eplb_worker.py | 2 -- vllm_ascend/eplb/eplb_updator.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/eplb/core/loader/device_transfer_loader.py b/vllm_ascend/eplb/core/loader/device_transfer_loader.py index 17cc1500e3..d7afb47e5f 100644 --- a/vllm_ascend/eplb/core/loader/device_transfer_loader.py +++ b/vllm_ascend/eplb/core/loader/device_transfer_loader.py @@ -106,6 +106,8 @@ def update_expert_map_and_weight(self, reqs): for recv_expert_info in self.recv_expert_list: global_expert_id_to_recv, buffer_tensor_id = recv_expert_info self.eplb_adaptor.do_update_expert_weight(self.layer_id, global_expert_id_to_recv, buffer_tensor_id) + + logger.info(f"[EPLB] finished update expert weight for layer: {self.layer_id}") self.recv_expert_list = [] self.updated_expert_map = None diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 4fbf007426..a2683cfa8a 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -256,8 +256,6 @@ def worker_process(self,planner_q,block_update_q): update_info = self.worker.do_update() - logger.info("update_info:", update_info) - for (a,b,c,d) in update_info: while True: if not block_update_q.empty(): diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index def5af2257..217da2f0a9 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -94,7 +94,7 @@ def forward_before(self): self.eplb_loader.generate_log2phy_map(updated_expert_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] - logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") + #logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info_this_rank, expert_recv_info_this_rank, updated_expert_map[rank_id], layer_id + 3) self.weight_update_counter += 1 From 69b817ed654b9dbff9e614c69752c2b76d9fe7d7 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Mon, 16 Jun 2025 18:32:28 +0800 Subject: [PATCH 066/153] [CI] Add unit test framework (#1201) This PR added the unit test framework to enable ut for vLLM Ascend. 
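As a sketch of the intended layout (the module and test below are hypothetical, not files added by this PR), a unit test for a module `vllm_ascend/foo.py` would live at `tests/ut/test_foo.py` and be picked up by the new CI job with `pytest -sv tests/ut`:

```python
# tests/ut/test_foo.py -- hypothetical example mirroring vllm_ascend/foo.py
import unittest


class TestFoo(unittest.TestCase):

    def test_something(self):
        # Real tests should import and exercise vllm_ascend.foo here;
        # this placeholder only illustrates the naming and layout convention.
        self.assertEqual(1 + 1, 2)


if __name__ == "__main__":
    unittest.main()
```

The conventions this follows are spelled out in the rest of this message.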
Unit test runs on CPU machines. It'll be ran once lint check is passed the same as e2e test. For unit test, this PR created a new folder called `ut` under `tests` module. All the test file in `ut` should keep the same with the code in `vllm-ascend`. The file name should be start with `test_` prefix. For example, in this PR. the `test_ascend_config.py` is added for `ascend_config.py` test. A new fille `worker/test_worker_v1.py` is also added as the placeholder. This file should be the unit test for `vllm-ascend/worker/worker_v1.py`. Additional, a new `fake_weight` folder is added, it contains the config.json from `facebook/opt-125m`, so that the test will not always visit huggingface. TODO: We should add all the unit test file one by one in the future. Signed-off-by: wangxiyuan --- .github/workflows/vllm_ascend_test.yaml | 138 ++++++---- .../workflows/vllm_ascend_test_long_term.yaml | 12 +- format.sh | 2 +- .../long_term/spec_decode/__init__.py | 0 .../long_term/spec_decode/conftest.py | 0 .../long_term/spec_decode/e2e/__init__.py | 0 .../long_term/spec_decode/e2e/conftest.py | 0 .../e2e/test_medusa_correctness.py | 4 +- .../spec_decode/e2e/test_mlp_correctness.py | 4 +- .../spec_decode/e2e/test_mtp_correctness.py | 0 .../spec_decode/e2e/test_ngram_correctness.py | 4 +- .../e2e/test_v1_mtp_correctness.py | 0 .../spec_decode/e2e/test_v1_spec_decode.py | 0 .../spec_decode/test_dynamic_spec_decode.py | 4 +- .../spec_decode/test_multi_step_worker.py | 2 +- .../spec_decode/test_ngram_worker.py | 2 +- .../spec_decode/test_spec_decode_worker.py | 8 +- .../long_term/spec_decode/test_utils.py | 0 .../{ => e2e}/long_term/spec_decode/utils.py | 0 tests/{ => e2e}/long_term/test_accuracy.py | 0 .../test_deepseek_v2_lite_tp2_accuracy.py | 0 .../test_dynamic_npugraph_batchsize.py | 0 .../multicard/test_ilama_lora_tp2.py | 4 +- .../test_offline_inference_distributed.py | 0 .../multicard/test_pyhccl_distributed.py | 0 .../multicard/test_torchair_graph_mode.py | 0 tests/{ => e2e}/singlecard/__init__.py | 0 .../{ => e2e}/singlecard/compile/__init__.py | 0 .../singlecard/compile/test_simple.py | 0 tests/{ => e2e}/singlecard/core/__init__.py | 0 .../singlecard/core/test_ascend_scheduler.py | 0 .../core/test_ascend_scheduler_e2e.py | 0 tests/{ => e2e}/singlecard/ops/__init__.py | 0 .../singlecard/ops/test_fused_moe.py | 0 .../singlecard/ops/test_multi_step.py | 0 .../singlecard/ops/test_rotary_embedding.py | 0 .../ops/test_vocabparallelembedding.py | 0 tests/{ => e2e}/singlecard/sample/__init__.py | 0 .../sample/test_rejection_sampler.py | 0 tests/{ => e2e}/singlecard/test_aclgraph.py | 0 tests/{ => e2e}/singlecard/test_camem.py | 0 tests/{ => e2e}/singlecard/test_chunked.py | 0 .../singlecard/test_guided_decoding.py | 0 tests/{ => e2e}/singlecard/test_ilama_lora.py | 0 .../singlecard/test_offline_inference.py | 0 .../test_profile_execute_duration.py | 0 .../singlecard/test_prompt_embedding.py | 0 tests/{ => e2e}/singlecard/test_pyhccl.py | 0 tests/{ => e2e}/singlecard/test_sampler.py | 0 tests/{ => e2e}/singlecard/test_scheduler.py | 0 tests/singlecard/test_ascend_config.py | 191 -------------- tests/ut/fake_weight/config.json | 28 ++ tests/ut/test_ascend_config.py | 244 ++++++++++++++++++ tests/ut/worker/test_worker_v1.py | 1 + vllm_ascend/ascend_config.py | 6 - vllm_ascend/platform.py | 8 + vllm_ascend/utils.py | 1 + 57 files changed, 396 insertions(+), 267 deletions(-) rename tests/{ => e2e}/long_term/spec_decode/__init__.py (100%) rename tests/{ => e2e}/long_term/spec_decode/conftest.py (100%) rename 
tests/{ => e2e}/long_term/spec_decode/e2e/__init__.py (100%) rename tests/{ => e2e}/long_term/spec_decode/e2e/conftest.py (100%) rename tests/{ => e2e}/long_term/spec_decode/e2e/test_medusa_correctness.py (99%) rename tests/{ => e2e}/long_term/spec_decode/e2e/test_mlp_correctness.py (99%) rename tests/{ => e2e}/long_term/spec_decode/e2e/test_mtp_correctness.py (100%) rename tests/{ => e2e}/long_term/spec_decode/e2e/test_ngram_correctness.py (99%) rename tests/{ => e2e}/long_term/spec_decode/e2e/test_v1_mtp_correctness.py (100%) rename tests/{ => e2e}/long_term/spec_decode/e2e/test_v1_spec_decode.py (100%) rename tests/{ => e2e}/long_term/spec_decode/test_dynamic_spec_decode.py (96%) rename tests/{ => e2e}/long_term/spec_decode/test_multi_step_worker.py (99%) rename tests/{ => e2e}/long_term/spec_decode/test_ngram_worker.py (99%) rename tests/{ => e2e}/long_term/spec_decode/test_spec_decode_worker.py (99%) rename tests/{ => e2e}/long_term/spec_decode/test_utils.py (100%) rename tests/{ => e2e}/long_term/spec_decode/utils.py (100%) rename tests/{ => e2e}/long_term/test_accuracy.py (100%) rename tests/{ => e2e}/long_term/test_deepseek_v2_lite_tp2_accuracy.py (100%) rename tests/{ => e2e}/multicard/test_dynamic_npugraph_batchsize.py (100%) rename tests/{ => e2e}/multicard/test_ilama_lora_tp2.py (83%) rename tests/{ => e2e}/multicard/test_offline_inference_distributed.py (100%) rename tests/{ => e2e}/multicard/test_pyhccl_distributed.py (100%) rename tests/{ => e2e}/multicard/test_torchair_graph_mode.py (100%) rename tests/{ => e2e}/singlecard/__init__.py (100%) rename tests/{ => e2e}/singlecard/compile/__init__.py (100%) rename tests/{ => e2e}/singlecard/compile/test_simple.py (100%) rename tests/{ => e2e}/singlecard/core/__init__.py (100%) rename tests/{ => e2e}/singlecard/core/test_ascend_scheduler.py (100%) rename tests/{ => e2e}/singlecard/core/test_ascend_scheduler_e2e.py (100%) rename tests/{ => e2e}/singlecard/ops/__init__.py (100%) rename tests/{ => e2e}/singlecard/ops/test_fused_moe.py (100%) rename tests/{ => e2e}/singlecard/ops/test_multi_step.py (100%) rename tests/{ => e2e}/singlecard/ops/test_rotary_embedding.py (100%) rename tests/{ => e2e/singlecard}/ops/test_vocabparallelembedding.py (100%) rename tests/{ => e2e}/singlecard/sample/__init__.py (100%) rename tests/{ => e2e}/singlecard/sample/test_rejection_sampler.py (100%) rename tests/{ => e2e}/singlecard/test_aclgraph.py (100%) rename tests/{ => e2e}/singlecard/test_camem.py (100%) rename tests/{ => e2e}/singlecard/test_chunked.py (100%) rename tests/{ => e2e}/singlecard/test_guided_decoding.py (100%) rename tests/{ => e2e}/singlecard/test_ilama_lora.py (100%) rename tests/{ => e2e}/singlecard/test_offline_inference.py (100%) rename tests/{ => e2e}/singlecard/test_profile_execute_duration.py (100%) rename tests/{ => e2e}/singlecard/test_prompt_embedding.py (100%) rename tests/{ => e2e}/singlecard/test_pyhccl.py (100%) rename tests/{ => e2e}/singlecard/test_sampler.py (100%) rename tests/{ => e2e}/singlecard/test_scheduler.py (100%) delete mode 100644 tests/singlecard/test_ascend_config.py create mode 100644 tests/ut/fake_weight/config.json create mode 100644 tests/ut/test_ascend_config.py create mode 100644 tests/ut/worker/test_worker_v1.py diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index de085e4850..62c91de8cc 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. 
# -name: 'e2e test / basic' +name: 'test' on: schedule: @@ -114,6 +114,56 @@ jobs: echo "::add-matcher::.github/workflows/matchers/mypy.json" tools/mypy.sh 1 ${{ matrix.python-version }} + ut: + needs: [lint] + name: unit test + if: ${{ needs.lint.result == 'success' }} + runs-on: ubuntu-latest + container: + image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 + env: + VLLM_LOGGING_LEVEL: ERROR + VLLM_USE_MODELSCOPE: True + strategy: + matrix: + vllm_version: [main, v0.9.1] + steps: + - name: Install packages + run: | + apt-get update -y + apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + ref: ${{ matrix.vllm_version }} + path: ./vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: ./vllm-empty + run: | + VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/ + python3 -m pip uninstall -y triton + + - name: Checkout vllm-project/vllm-ascend repo + uses: actions/checkout@v4 + + - name: Install vllm-project/vllm-ascend + run: | + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib + python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/ + python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/ + + - name: Run unit test for V1 Engine + env: + VLLM_USE_V1: 1 + VLLM_WORKER_MULTIPROC_METHOD: spawn + TORCH_DEVICE_BACKEND_AUTOLOAD: 0 + run: | + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib + pytest -sv tests/ut + e2e: needs: [lint] if: ${{ needs.lint.result == 'success' }} @@ -122,7 +172,7 @@ jobs: matrix: os: [linux-arm64-npu-1] vllm_version: [main, v0.9.1] - name: vLLM Ascend test + name: singlecard e2e test runs-on: ${{ matrix.os }} container: # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready @@ -168,53 +218,47 @@ jobs: pip install -r requirements-dev.txt pip install -v -e . - - name: Run vllm-project/vllm-ascend test for V1 Engine + - name: Run e2e test for V1 Engine env: VLLM_USE_V1: 1 VLLM_WORKER_MULTIPROC_METHOD: spawn VLLM_USE_MODELSCOPE: True run: | - pytest -sv tests/singlecard/test_offline_inference.py + pytest -sv tests/e2e/singlecard/test_offline_inference.py # TODO: switch hf to modelscope VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/singlecard/test_ilama_lora.py + pytest -sv tests/e2e/singlecard/test_ilama_lora.py # TODO(sss): guided decoding doesn't work, fix it later - # pytest -sv tests/singlecard/test_guided_decoding.py - # test_ascend_config.py should be ran separately because it will regenerate the global config many times. 
- pytest -sv tests/singlecard/test_ascend_config.py - pytest -sv tests/singlecard/test_camem.py - pytest -sv tests/singlecard/ \ - --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_ilama_lora.py \ - --ignore=tests/singlecard/test_guided_decoding.py \ - --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_camem.py + # pytest -sv tests/e2e/singlecard/test_guided_decoding.py + pytest -sv tests/e2e/singlecard/test_camem.py + pytest -sv tests/e2e/singlecard/ \ + --ignore=tests/e2e/singlecard/test_offline_inference.py \ + --ignore=tests/e2e/singlecard/test_ilama_lora.py \ + --ignore=tests/e2e/singlecard/test_guided_decoding.py \ + --ignore=tests/e2e/singlecard/test_camem.py - - name: Run vllm-project/vllm-ascend test on V0 engine + - name: Run e2e test on V0 engine if: ${{ github.event_name == 'schedule' }} env: VLLM_USE_V1: 0 VLLM_USE_MODELSCOPE: True run: | - pytest -sv tests/singlecard/test_offline_inference.py + pytest -sv tests/e2e/singlecard/test_offline_inference.py # TODO: switch hf to modelscope VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/singlecard/test_ilama_lora.py + pytest -sv tests/e2e/singlecard/test_ilama_lora.py # guided decoding doesn't work, fix it later - # pytest -sv tests/singlecard/test_guided_decoding.py - pytest -sv tests/singlecard/test_camem.py - # test_ascend_config.py should be ran separately because it will regenerate the global config many times. - pytest -sv tests/singlecard/test_ascend_config.py - pytest -sv tests/singlecard/test_prompt_embedding.py - pytest -sv tests/singlecard/ \ - --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_ilama_lora.py \ - --ignore=tests/singlecard/test_guided_decoding.py \ - --ignore=tests/singlecard/test_camem.py \ - --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_prompt_embedding.py \ - --ignore=tests/singlecard/core/test_ascend_scheduler.py \ - --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py + # pytest -sv tests/e2e/singlecard/test_guided_decoding.py + pytest -sv tests/e2e/singlecard/test_camem.py + pytest -sv tests/e2e/singlecard/test_prompt_embedding.py + pytest -sv tests/e2e/singlecard/ \ + --ignore=tests/e2e/singlecard/test_offline_inference.py \ + --ignore=tests/e2e/singlecard/test_ilama_lora.py \ + --ignore=tests/e2e/singlecard/test_guided_decoding.py \ + --ignore=tests/e2e/singlecard/test_camem.py \ + --ignore=tests/e2e/singlecard/test_prompt_embedding.py \ + --ignore=tests/e2e/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py e2e-4-cards: needs: [e2e] @@ -224,7 +268,7 @@ jobs: matrix: os: [linux-arm64-npu-4] vllm_version: [main, v0.9.1] - name: vLLM Ascend test + name: multicard e2e test runs-on: ${{ matrix.os }} container: # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready @@ -279,14 +323,14 @@ jobs: run: | # TODO: switch hf to modelscope VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/multicard/test_ilama_lora_tp2.py - # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. + pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py + # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error. # To avoid oom, we need to run the test in a single process. 
- pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 + pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py - name: Run vllm-project/vllm-ascend test on V0 engine if: ${{ github.event_name == 'schedule' }} @@ -296,11 +340,11 @@ jobs: run: | # TODO: switch hf to modelscope VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/multicard/test_ilama_lora_tp2.py - # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. + pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py + # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error. # To avoid oom, we need to run the test in a single process. - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk - pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 + pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index c17200a8d1..46123590ab 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -96,12 +96,12 @@ jobs: run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then # spec decode test - VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed - # VLLM_USE_MODELSCOPE=True pytest -sv 
tests/long_term/spec_decode/e2e/test_v1_spec_decode.py - VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process - pytest -sv tests/long_term/spec_decode --ignore=tests/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py - pytest -sv tests/long_term/test_accuracy.py + # VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process + pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py + pytest -sv tests/e2e/long_term/test_accuracy.py else - VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py + VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py fi diff --git a/format.sh b/format.sh index d8a04069a8..e49ac915b9 100755 --- a/format.sh +++ b/format.sh @@ -273,7 +273,7 @@ echo 'vllm-ascend isort: Done' # Clang-format section # Exclude some files for formatting because they are vendored CLANG_FORMAT_EXCLUDES=( - 'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h' + 'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/kernels/get_masked_input_and_mask_kernel.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h' ) # Format specified files with clang-format diff --git a/tests/long_term/spec_decode/__init__.py b/tests/e2e/long_term/spec_decode/__init__.py similarity index 100% rename from tests/long_term/spec_decode/__init__.py rename to tests/e2e/long_term/spec_decode/__init__.py diff --git a/tests/long_term/spec_decode/conftest.py b/tests/e2e/long_term/spec_decode/conftest.py similarity index 100% rename from tests/long_term/spec_decode/conftest.py rename to tests/e2e/long_term/spec_decode/conftest.py diff --git a/tests/long_term/spec_decode/e2e/__init__.py b/tests/e2e/long_term/spec_decode/e2e/__init__.py similarity index 100% rename from tests/long_term/spec_decode/e2e/__init__.py rename to tests/e2e/long_term/spec_decode/e2e/__init__.py diff --git a/tests/long_term/spec_decode/e2e/conftest.py b/tests/e2e/long_term/spec_decode/e2e/conftest.py similarity index 100% rename from tests/long_term/spec_decode/e2e/conftest.py rename to tests/e2e/long_term/spec_decode/e2e/conftest.py diff --git a/tests/long_term/spec_decode/e2e/test_medusa_correctness.py b/tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py similarity index 99% rename from tests/long_term/spec_decode/e2e/test_medusa_correctness.py rename to tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py index c88ee76fa9..e0c2efd7af 100644 --- a/tests/long_term/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py @@ -41,9 +41,9 @@ import pytest -from tests.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode.e2e.conftest import \ run_equality_correctness_test -from tests.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill # main model # lmsys/vicuna-7b-v1.3 was 
to be used but it's causing diff --git a/tests/long_term/spec_decode/e2e/test_mlp_correctness.py b/tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py similarity index 99% rename from tests/long_term/spec_decode/e2e/test_mlp_correctness.py rename to tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py index ee4e7ccd7d..56db617755 100644 --- a/tests/long_term/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py @@ -41,9 +41,9 @@ from vllm.model_executor.layers.vocab_parallel_embedding import \ pad_vocab_size # noqa: F401 -from tests.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode.e2e.conftest import \ run_equality_correctness_test -from tests.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill # main model MAIN_MODEL = "JackFram/llama-160m" diff --git a/tests/long_term/spec_decode/e2e/test_mtp_correctness.py b/tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py similarity index 100% rename from tests/long_term/spec_decode/e2e/test_mtp_correctness.py rename to tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py diff --git a/tests/long_term/spec_decode/e2e/test_ngram_correctness.py b/tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py similarity index 99% rename from tests/long_term/spec_decode/e2e/test_ngram_correctness.py rename to tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py index 55454732d5..b99187fe37 100644 --- a/tests/long_term/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py @@ -44,9 +44,9 @@ import pytest -from tests.long_term.spec_decode.e2e.conftest import \ +from tests.e2e.long_term.spec_decode.e2e.conftest import \ run_equality_correctness_test -from tests.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill @pytest.mark.parametrize( diff --git a/tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py b/tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py similarity index 100% rename from tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py rename to tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py diff --git a/tests/long_term/spec_decode/e2e/test_v1_spec_decode.py b/tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py similarity index 100% rename from tests/long_term/spec_decode/e2e/test_v1_spec_decode.py rename to tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py diff --git a/tests/long_term/spec_decode/test_dynamic_spec_decode.py b/tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py similarity index 96% rename from tests/long_term/spec_decode/test_dynamic_spec_decode.py rename to tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py index f9656f5f4d..8e9480ea26 100644 --- a/tests/long_term/spec_decode/test_dynamic_spec_decode.py +++ b/tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py @@ -27,8 +27,8 @@ from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.long_term.spec_decode.test_utils import mock_spec_decode_sampler -from tests.long_term.spec_decode.utils import create_batch, mock_worker +from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler +from tests.e2e.long_term.spec_decode.utils import create_batch, mock_worker 
@pytest.mark.parametrize('queue_size', [4]) diff --git a/tests/long_term/spec_decode/test_multi_step_worker.py b/tests/e2e/long_term/spec_decode/test_multi_step_worker.py similarity index 99% rename from tests/long_term/spec_decode/test_multi_step_worker.py rename to tests/e2e/long_term/spec_decode/test_multi_step_worker.py index b9c2e0cc3d..b3017a987e 100644 --- a/tests/long_term/spec_decode/test_multi_step_worker.py +++ b/tests/e2e/long_term/spec_decode/test_multi_step_worker.py @@ -29,7 +29,7 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.long_term.spec_decode.utils import ( +from tests.e2e.long_term.spec_decode.utils import ( assert_logprobs_dict_allclose, create_batch, create_seq_group_metadata_from_prompts, create_worker, patch_execute_model_with_seeds, zero_kv_cache) diff --git a/tests/long_term/spec_decode/test_ngram_worker.py b/tests/e2e/long_term/spec_decode/test_ngram_worker.py similarity index 99% rename from tests/long_term/spec_decode/test_ngram_worker.py rename to tests/e2e/long_term/spec_decode/test_ngram_worker.py index 1ad02bb1de..078a4d2bed 100644 --- a/tests/long_term/spec_decode/test_ngram_worker.py +++ b/tests/e2e/long_term/spec_decode/test_ngram_worker.py @@ -22,7 +22,7 @@ from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.long_term.spec_decode.utils import ( +from tests.e2e.long_term.spec_decode.utils import ( create_seq_group_metadata_from_prompts, create_worker) diff --git a/tests/long_term/spec_decode/test_spec_decode_worker.py b/tests/e2e/long_term/spec_decode/test_spec_decode_worker.py similarity index 99% rename from tests/long_term/spec_decode/test_spec_decode_worker.py rename to tests/e2e/long_term/spec_decode/test_spec_decode_worker.py index b5abd1e123..94a1bcf1e7 100644 --- a/tests/long_term/spec_decode/test_spec_decode_worker.py +++ b/tests/e2e/long_term/spec_decode/test_spec_decode_worker.py @@ -35,10 +35,10 @@ from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) -from tests.long_term.spec_decode.test_utils import mock_spec_decode_sampler -from tests.long_term.spec_decode.utils import (create_batch, - create_sampler_output_list, - create_worker, mock_worker) +from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler +from tests.e2e.long_term.spec_decode.utils import (create_batch, + create_sampler_output_list, + create_worker, mock_worker) from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner from vllm_ascend.worker.worker import NPUWorker diff --git a/tests/long_term/spec_decode/test_utils.py b/tests/e2e/long_term/spec_decode/test_utils.py similarity index 100% rename from tests/long_term/spec_decode/test_utils.py rename to tests/e2e/long_term/spec_decode/test_utils.py diff --git a/tests/long_term/spec_decode/utils.py b/tests/e2e/long_term/spec_decode/utils.py similarity index 100% rename from tests/long_term/spec_decode/utils.py rename to tests/e2e/long_term/spec_decode/utils.py diff --git a/tests/long_term/test_accuracy.py b/tests/e2e/long_term/test_accuracy.py similarity index 100% rename from tests/long_term/test_accuracy.py rename to tests/e2e/long_term/test_accuracy.py diff --git a/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py b/tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py similarity index 100% rename from tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py rename to 
tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py diff --git a/tests/multicard/test_dynamic_npugraph_batchsize.py b/tests/e2e/multicard/test_dynamic_npugraph_batchsize.py similarity index 100% rename from tests/multicard/test_dynamic_npugraph_batchsize.py rename to tests/e2e/multicard/test_dynamic_npugraph_batchsize.py diff --git a/tests/multicard/test_ilama_lora_tp2.py b/tests/e2e/multicard/test_ilama_lora_tp2.py similarity index 83% rename from tests/multicard/test_ilama_lora_tp2.py rename to tests/e2e/multicard/test_ilama_lora_tp2.py index e61ce250c8..e743141b7a 100644 --- a/tests/multicard/test_ilama_lora_tp2.py +++ b/tests/e2e/multicard/test_ilama_lora_tp2.py @@ -1,8 +1,8 @@ import pytest from tests.conftest import VllmRunner -from tests.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT, MODEL_PATH, - do_sample) +from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT, + MODEL_PATH, do_sample) @pytest.mark.parametrize("distributed_executor_backend", ["mp"]) diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py similarity index 100% rename from tests/multicard/test_offline_inference_distributed.py rename to tests/e2e/multicard/test_offline_inference_distributed.py diff --git a/tests/multicard/test_pyhccl_distributed.py b/tests/e2e/multicard/test_pyhccl_distributed.py similarity index 100% rename from tests/multicard/test_pyhccl_distributed.py rename to tests/e2e/multicard/test_pyhccl_distributed.py diff --git a/tests/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py similarity index 100% rename from tests/multicard/test_torchair_graph_mode.py rename to tests/e2e/multicard/test_torchair_graph_mode.py diff --git a/tests/singlecard/__init__.py b/tests/e2e/singlecard/__init__.py similarity index 100% rename from tests/singlecard/__init__.py rename to tests/e2e/singlecard/__init__.py diff --git a/tests/singlecard/compile/__init__.py b/tests/e2e/singlecard/compile/__init__.py similarity index 100% rename from tests/singlecard/compile/__init__.py rename to tests/e2e/singlecard/compile/__init__.py diff --git a/tests/singlecard/compile/test_simple.py b/tests/e2e/singlecard/compile/test_simple.py similarity index 100% rename from tests/singlecard/compile/test_simple.py rename to tests/e2e/singlecard/compile/test_simple.py diff --git a/tests/singlecard/core/__init__.py b/tests/e2e/singlecard/core/__init__.py similarity index 100% rename from tests/singlecard/core/__init__.py rename to tests/e2e/singlecard/core/__init__.py diff --git a/tests/singlecard/core/test_ascend_scheduler.py b/tests/e2e/singlecard/core/test_ascend_scheduler.py similarity index 100% rename from tests/singlecard/core/test_ascend_scheduler.py rename to tests/e2e/singlecard/core/test_ascend_scheduler.py diff --git a/tests/singlecard/core/test_ascend_scheduler_e2e.py b/tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py similarity index 100% rename from tests/singlecard/core/test_ascend_scheduler_e2e.py rename to tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py diff --git a/tests/singlecard/ops/__init__.py b/tests/e2e/singlecard/ops/__init__.py similarity index 100% rename from tests/singlecard/ops/__init__.py rename to tests/e2e/singlecard/ops/__init__.py diff --git a/tests/singlecard/ops/test_fused_moe.py b/tests/e2e/singlecard/ops/test_fused_moe.py similarity index 100% rename from tests/singlecard/ops/test_fused_moe.py rename to tests/e2e/singlecard/ops/test_fused_moe.py diff 
--git a/tests/singlecard/ops/test_multi_step.py b/tests/e2e/singlecard/ops/test_multi_step.py similarity index 100% rename from tests/singlecard/ops/test_multi_step.py rename to tests/e2e/singlecard/ops/test_multi_step.py diff --git a/tests/singlecard/ops/test_rotary_embedding.py b/tests/e2e/singlecard/ops/test_rotary_embedding.py similarity index 100% rename from tests/singlecard/ops/test_rotary_embedding.py rename to tests/e2e/singlecard/ops/test_rotary_embedding.py diff --git a/tests/ops/test_vocabparallelembedding.py b/tests/e2e/singlecard/ops/test_vocabparallelembedding.py similarity index 100% rename from tests/ops/test_vocabparallelembedding.py rename to tests/e2e/singlecard/ops/test_vocabparallelembedding.py diff --git a/tests/singlecard/sample/__init__.py b/tests/e2e/singlecard/sample/__init__.py similarity index 100% rename from tests/singlecard/sample/__init__.py rename to tests/e2e/singlecard/sample/__init__.py diff --git a/tests/singlecard/sample/test_rejection_sampler.py b/tests/e2e/singlecard/sample/test_rejection_sampler.py similarity index 100% rename from tests/singlecard/sample/test_rejection_sampler.py rename to tests/e2e/singlecard/sample/test_rejection_sampler.py diff --git a/tests/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py similarity index 100% rename from tests/singlecard/test_aclgraph.py rename to tests/e2e/singlecard/test_aclgraph.py diff --git a/tests/singlecard/test_camem.py b/tests/e2e/singlecard/test_camem.py similarity index 100% rename from tests/singlecard/test_camem.py rename to tests/e2e/singlecard/test_camem.py diff --git a/tests/singlecard/test_chunked.py b/tests/e2e/singlecard/test_chunked.py similarity index 100% rename from tests/singlecard/test_chunked.py rename to tests/e2e/singlecard/test_chunked.py diff --git a/tests/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py similarity index 100% rename from tests/singlecard/test_guided_decoding.py rename to tests/e2e/singlecard/test_guided_decoding.py diff --git a/tests/singlecard/test_ilama_lora.py b/tests/e2e/singlecard/test_ilama_lora.py similarity index 100% rename from tests/singlecard/test_ilama_lora.py rename to tests/e2e/singlecard/test_ilama_lora.py diff --git a/tests/singlecard/test_offline_inference.py b/tests/e2e/singlecard/test_offline_inference.py similarity index 100% rename from tests/singlecard/test_offline_inference.py rename to tests/e2e/singlecard/test_offline_inference.py diff --git a/tests/singlecard/test_profile_execute_duration.py b/tests/e2e/singlecard/test_profile_execute_duration.py similarity index 100% rename from tests/singlecard/test_profile_execute_duration.py rename to tests/e2e/singlecard/test_profile_execute_duration.py diff --git a/tests/singlecard/test_prompt_embedding.py b/tests/e2e/singlecard/test_prompt_embedding.py similarity index 100% rename from tests/singlecard/test_prompt_embedding.py rename to tests/e2e/singlecard/test_prompt_embedding.py diff --git a/tests/singlecard/test_pyhccl.py b/tests/e2e/singlecard/test_pyhccl.py similarity index 100% rename from tests/singlecard/test_pyhccl.py rename to tests/e2e/singlecard/test_pyhccl.py diff --git a/tests/singlecard/test_sampler.py b/tests/e2e/singlecard/test_sampler.py similarity index 100% rename from tests/singlecard/test_sampler.py rename to tests/e2e/singlecard/test_sampler.py diff --git a/tests/singlecard/test_scheduler.py b/tests/e2e/singlecard/test_scheduler.py similarity index 100% rename from tests/singlecard/test_scheduler.py rename to 
tests/e2e/singlecard/test_scheduler.py diff --git a/tests/singlecard/test_ascend_config.py b/tests/singlecard/test_ascend_config.py deleted file mode 100644 index 63484d4a01..0000000000 --- a/tests/singlecard/test_ascend_config.py +++ /dev/null @@ -1,191 +0,0 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -import pytest - -from tests.conftest import VllmRunner -from vllm_ascend.ascend_config import (clear_ascend_config, get_ascend_config, - init_ascend_config) - - -def _clean_up_ascend_config(func): - - def wrapper(*args, **kwargs): - clear_ascend_config() - func(*args, **kwargs) - clear_ascend_config() - - return wrapper - - -@_clean_up_ascend_config -def test_run_without_ascend_config(): - with VllmRunner("facebook/opt-125m"): - ascend_config = get_ascend_config() - - assert not ascend_config.torchair_graph_config.enabled - assert not ascend_config.torchair_graph_config.use_cached_graph - assert ascend_config.torchair_graph_config.graph_batch_sizes == [] - assert not ascend_config.torchair_graph_config.graph_batch_sizes_init - assert not ascend_config.ascend_scheduler_config.enabled - assert ascend_config.expert_tensor_parallel_size == 0 - - -@_clean_up_ascend_config -def test_run_with_ascend_config(): - if os.getenv("VLLM_USE_V1") == "0": - pytest.skip("graph only works on v1") - - input_additional_config_1 = { - "torchair_graph_config": { - # torchair graph only works with deepseek. The e2e test should be added - # in multicard test with deepseek models. 
- "enabled": False, - "use_cached_graph": True, - "graph_batch_sizes": [1, 2, 4, 8], - "graph_batch_sizes_init": False, - "enable_multistream_moe": True, - "enable_multistream_mla": True, - }, - "ascend_scheduler_config": { - "enabled": True, - "enable_chunked_prefill": True, - }, - "expert_tensor_parallel_size": 1 - } - - # check passed with eager mode - with VllmRunner("facebook/opt-125m", - enforce_eager=True, - additional_config=input_additional_config_1): - ascend_config = get_ascend_config() - - assert not ascend_config.torchair_graph_config.enabled - assert ascend_config.torchair_graph_config.use_cached_graph - assert ascend_config.torchair_graph_config.graph_batch_sizes == [ - 1, 2, 4, 8 - ] - assert not ascend_config.torchair_graph_config.graph_batch_sizes_init - assert ascend_config.torchair_graph_config.enable_multistream_mla - assert ascend_config.torchair_graph_config.enable_multistream_moe - assert ascend_config.ascend_scheduler_config.enabled - assert ascend_config.ascend_scheduler_config.enable_chunked_prefill - assert ascend_config.expert_tensor_parallel_size == 1 - - -@_clean_up_ascend_config -def test_ascend_config_init_error(): - # ascend_config should be initialized first - with pytest.raises(RuntimeError): - _ = get_ascend_config() - - -@_clean_up_ascend_config -def test_ascend_config_load_error(): - if os.getenv("VLLM_USE_V1") == "0": - pytest.skip("graph only works on v1") - # graph_batch_sizes should be list. - with pytest.raises(TypeError): - input_additional_config_fake_1 = { - "torchair_graph_config": { - "graph_batch_sizes": "fake_size", - }, - } - with VllmRunner("facebook/opt-125m", - additional_config=input_additional_config_fake_1): - pass - - # graph_batch_sizes_init should not be True when graph_batch_sizes is not empty. - with pytest.raises(ValueError): - input_additional_config_fake_2 = { - "torchair_graph_config": { - "graph_batch_sizes": [1, 2, 4, 8], - "graph_batch_sizes_init": True, - }, - } - with VllmRunner("facebook/opt-125m", - additional_config=input_additional_config_fake_2): - pass - - # torchair graph only works with deepseek. 
- with pytest.raises(NotImplementedError): - input_additional_config_fake_2 = { - "torchair_graph_config": { - "enabled": True, - }, - } - with VllmRunner("facebook/opt-125m", - enforce_eager=False, - additional_config=input_additional_config_fake_2): - pass - - # torchair graph should not be enabled with eager mode - with pytest.raises(RuntimeError): - input_additional_config_fake_3 = { - "torchair_graph_config": { - "enabled": True, - }, - } - with VllmRunner("facebook/opt-125m", - enforce_eager=True, - additional_config=input_additional_config_fake_3): - pass - - -@_clean_up_ascend_config -def test_check_ascend_config_v0(): - if os.getenv("VLLM_USE_V1") == "1": - pytest.skip("graph only works on v1, this is the test for v0") - with pytest.raises(NotImplementedError): - input_additional_config_fake_1 = { - "torchair_graph_config": { - "enabled": True, - }, - } - with VllmRunner("facebook/opt-125m", - additional_config=input_additional_config_fake_1): - pass - - -@_clean_up_ascend_config -def test_ascend_config_refresh(): - from vllm.config import get_current_vllm_config - vllm_config = get_current_vllm_config() - # set additional_config with none - init_ascend_config(vllm_config) - - input_additional_config = { - "torchair_graph_config": { - "enabled": False, - "use_cached_graph": True, - "graph_batch_sizes": [1, 2, 4, 8], - "graph_batch_sizes_init": False, - }, - "refresh": True, - } - - # refresh ascend config - with VllmRunner("facebook/opt-125m", - additional_config=input_additional_config): - ascend_config = get_ascend_config() - - assert not ascend_config.torchair_graph_config.enabled - assert ascend_config.torchair_graph_config.use_cached_graph - assert ascend_config.torchair_graph_config.graph_batch_sizes == [ - 1, 2, 4, 8 - ] - assert not ascend_config.torchair_graph_config.graph_batch_sizes_init diff --git a/tests/ut/fake_weight/config.json b/tests/ut/fake_weight/config.json new file mode 100644 index 0000000000..b3fb716a30 --- /dev/null +++ b/tests/ut/fake_weight/config.json @@ -0,0 +1,28 @@ +{ + "_name_or_path": "facebook/opt-125m", + "activation_dropout": 0.0, + "activation_function": "relu", + "architectures": [ + "OPTForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 2, + "do_layer_norm_before": true, + "dropout": 0.1, + "eos_token_id": 2, + "ffn_dim": 3072, + "hidden_size": 768, + "init_std": 0.02, + "layerdrop": 0.0, + "max_position_embeddings": 2048, + "model_type": "opt", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 1, + "prefix": "", + "torch_dtype": "float16", + "transformers_version": "4.21.0.dev0", + "use_cache": true, + "vocab_size": 50272, + "word_embed_proj_dim": 768 +} diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py new file mode 100644 index 0000000000..5ec4dd72cc --- /dev/null +++ b/tests/ut/test_ascend_config.py @@ -0,0 +1,244 @@ +import os +import unittest +from unittest import mock + +from transformers import PretrainedConfig +from vllm.config import ModelConfig, VllmConfig + +from vllm_ascend.ascend_config import (check_ascend_config, + clear_ascend_config, get_ascend_config, + init_ascend_config) + + +class TestAscendConfig(unittest.TestCase): + + @staticmethod + def _clean_up_ascend_config(func): + + def wrapper(*args, **kwargs): + clear_ascend_config() + func(*args, **kwargs) + clear_ascend_config() + + return wrapper + + @_clean_up_ascend_config + def test_init_ascend_config_without_additional_config(self): + test_vllm_config = VllmConfig() + # No additional config given, check the 
default value here. + ascend_config = init_ascend_config(test_vllm_config) + self.assertEqual(ascend_config.expert_tensor_parallel_size, 0) + self.assertIsNone(ascend_config.expert_map_path) + + torchair_graph_config = ascend_config.torchair_graph_config + self.assertFalse(torchair_graph_config.enabled) + self.assertFalse(torchair_graph_config.use_cached_graph) + self.assertEqual(torchair_graph_config.graph_batch_sizes, []) + self.assertFalse(torchair_graph_config.graph_batch_sizes_init) + self.assertFalse(torchair_graph_config.enable_multistream_mla) + self.assertFalse(torchair_graph_config.enable_multistream_moe) + self.assertTrue(torchair_graph_config.enable_view_optimize) + self.assertFalse(torchair_graph_config.enable_kv_nz) + + ascend_scheduler_config = ascend_config.ascend_scheduler_config + self.assertFalse(ascend_scheduler_config.enabled) + + @_clean_up_ascend_config + def test_init_ascend_config_with_additional_config(self): + test_vllm_config = VllmConfig() + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + "use_cached_graph": True, + "graph_batch_sizes": [1, 2, 4], + "graph_batch_sizes_init": False, + "enable_multistream_mla": True, + "enable_multistream_moe": True, + "enable_view_optimize": True, + "enable_kv_nz": True + }, + "ascend_scheduler_config": { + "enabled": True + }, + "expert_tensor_parallel_size": 1, + "expert_map_path": "test_expert_map_path", + "refresh": True + } + ascend_config = init_ascend_config(test_vllm_config) + self.assertEqual(ascend_config.expert_tensor_parallel_size, 1) + self.assertEqual(ascend_config.expert_map_path, "test_expert_map_path") + + torchair_graph_config = ascend_config.torchair_graph_config + self.assertTrue(torchair_graph_config.enabled) + self.assertTrue(torchair_graph_config.use_cached_graph) + self.assertEqual(torchair_graph_config.graph_batch_sizes, [1, 2, 4]) + self.assertFalse(torchair_graph_config.graph_batch_sizes_init) + self.assertTrue(torchair_graph_config.enable_multistream_mla) + self.assertTrue(torchair_graph_config.enable_multistream_moe) + self.assertTrue(torchair_graph_config.enable_view_optimize) + self.assertTrue(torchair_graph_config.enable_kv_nz) + + ascend_scheduler_config = ascend_config.ascend_scheduler_config + self.assertTrue(ascend_scheduler_config.enabled) + + @_clean_up_ascend_config + def test_init_ascend_config_with_refresh(self): + test_vllm_config = VllmConfig() + ascend_config = init_ascend_config(test_vllm_config) + self.assertFalse(ascend_config.torchair_graph_config.enabled) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + } + ascend_config = init_ascend_config(test_vllm_config) + self.assertFalse(ascend_config.torchair_graph_config.enabled) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True, + } + ascend_config = init_ascend_config(test_vllm_config) + self.assertTrue(ascend_config.torchair_graph_config.enabled) + + @_clean_up_ascend_config + def test_init_ascend_config_with_wrong_input(self): + test_vllm_config = VllmConfig() + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + "graph_batch_sizes": "fake_size", + }, + "refresh": True, + } + with self.assertRaises(TypeError): + init_ascend_config(test_vllm_config) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + "graph_batch_sizes": [1, 2, 4, 8], + "graph_batch_sizes_init": True, + }, + "refresh": True, + } + with 
self.assertRaises(ValueError): + init_ascend_config(test_vllm_config) + + @_clean_up_ascend_config + def test_get_ascend_config(self): + test_vllm_config = VllmConfig() + ascend_config = init_ascend_config(test_vllm_config) + self.assertEqual(get_ascend_config(), ascend_config) + + @_clean_up_ascend_config + def test_get_ascend_config_without_init(self): + with self.assertRaises(RuntimeError): + get_ascend_config() + + @_clean_up_ascend_config + def test_clear_ascend_config(self): + test_vllm_config = VllmConfig() + ascend_config = init_ascend_config(test_vllm_config) + self.assertEqual(get_ascend_config(), ascend_config) + clear_ascend_config() + with self.assertRaises(RuntimeError): + get_ascend_config() + + @_clean_up_ascend_config + def test_check_ascend_config_pass(self): + test_vllm_config = VllmConfig() + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + + # For V1 engine + with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + + @_clean_up_ascend_config + def test_check_ascend_config_wrong_case(self): + test_vllm_config = VllmConfig() + # For V0 engine + with mock.patch.dict(os.environ, {"VLLM_USE_V1": "0"}): + with self.assertRaises(NotImplementedError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + with self.assertRaises(NotImplementedError): + test_vllm_config.additional_config = { + "ascend_scheduler_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, True) + # For V1 engine + with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}): + # torchair + eager mode + with self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + enforce_eager = True + check_ascend_config(test_vllm_config, enforce_eager) + # torchair + non deepseek model + with self.assertRaises(NotImplementedError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + model_path = os.path.join(os.path.dirname(__file__), + "fake_weight") + fake_model_config = ModelConfig(model=model_path) + fake_model_config.hf_config = PretrainedConfig() + fake_model_config.hf_config.model_type = "llama" + test_vllm_config.model_config = fake_model_config + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + # aclgraph + deepseek model + with self.assertRaises(NotImplementedError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + }, + "refresh": True + } + model_path = os.path.join(os.path.dirname(__file__), + "fake_weight") + fake_model_config = ModelConfig(model=model_path) + fake_model_config.hf_config = PretrainedConfig() + fake_model_config.hf_config.model_type = "deepseek" + test_vllm_config.model_config = fake_model_config + init_ascend_config(test_vllm_config) + 
check_ascend_config(test_vllm_config, False) diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py new file mode 100644 index 0000000000..fdffa2a0fd --- /dev/null +++ b/tests/ut/worker/test_worker_v1.py @@ -0,0 +1 @@ +# placeholder diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index defa7fd3d8..d8b87c6952 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -138,12 +138,6 @@ def check_ascend_config(vllm_config, enforce_eager): else: # torchair_graph case if ascend_config.torchair_graph_config.enabled: - # torchair_graph is not supported for V1 without mla currently. - if envs.VLLM_MLA_DISABLE: - logger.warning( - "Torchair graph mode is still experimental and not supported for V1 without mla currently, " - "it has been disabled automatically.") - ascend_config.torchair_graph_config.enabled = False # torchair_graph is supported for deepseek model only currently. if vllm_config.model_config: model_type = vllm_config.model_config.hf_config.model_type diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 912e375f70..9194ae9a16 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -164,6 +164,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: else: enforce_eager = getattr(model_config, "enforce_eager", False) + if ascend_config.torchair_graph_config.enabled and envs.VLLM_MLA_DISABLE: + # torchair_graph is not supported for V1 without mla currently. + logger.warning( + "Torchair graph mode is still experimental and not supported for V1 without mla currently, " + "Fallback to eager mode.") + ascend_config.torchair_graph_config.enabled = False + enforce_eager = True + check_ascend_config(vllm_config, enforce_eager) if enforce_eager or compilation_config.level == CompilationLevel.NO_COMPILATION: diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index f41dab4b94..e29290e73a 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -24,6 +24,7 @@ from typing import TYPE_CHECKING, List, Tuple import torch +import torch_npu # noqa: F401 import torchair # type: ignore[import] # noqa: F401 from packaging.version import InvalidVersion, Version from torch_npu.npu.streams import Event From 0c6210f7ea7f4bde4a7b62c2912d9ac4aef62ddb Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Mon, 16 Jun 2025 20:48:56 +0800 Subject: [PATCH 067/153] fix bugsw --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 50 ++++++++++++++++++------ 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index b56468a66c..4af0673d01 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -17,9 +17,12 @@ import torch import torch.distributed as dist +import numpy as np from vllm_ascend.eplb.adaptor.abstract_adaptor import EplbAdaptor from vllm.logger import logger +import random + class VllmEplbAdaptor(EplbAdaptor): @@ -29,6 +32,7 @@ def __init__(self, model, **args): self.param_dict = dict(self.model.named_parameters()) self.num_dense_layers = self.model.config.first_k_dense_replace self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers + self.global_expert_num = 256 # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 is supported here self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", @@ -72,18 +76,16 @@ def get_init_expert_map(self, 
num_moe_layers): expert_map = self.model.get_all_expert_map(num_moe_layers) if dist.is_initialized(): world_size = dist.get_world_size() + rank = dist.get_rank() - rank = dist.get_rank() - - tensor_list = [ - torch.zeros_like(expert_map) for _ in range(world_size) - ] + gathered = torch.empty((world_size, *expert_map.shape), # [W, L, E] + dtype=expert_map.dtype, + device=expert_map.device) - dist.all_gather(tensor_list, expert_map) - gathered = torch.stack(tensor_list, dim=0) - all_maps = gathered.permute(1, 0, 2).contiguous() + dist.all_gather_into_tensor(gathered, expert_map) + all_maps = gathered.permute(1, 0, 2) + all_expert_maps = all_maps.cpu() - all_expert_maps = all_maps.to(torch.device("cpu")) return all_expert_maps def do_update_expert_map(self, layer_id, updated_expert_map): @@ -111,6 +113,8 @@ def generate_index_dicts(self,tensor_2d): return dict_list def generate_log2phy_map(self, expert_map): + num_local_experts = expert_map.max() + 1 + expert_map = self.global2local(expert_map,num_local_experts) ranks_num, global_expert_num = expert_map.shape concatenated = torch.flatten(expert_map) rank_expert_to_global = self.generate_index_dicts( @@ -122,7 +126,7 @@ def generate_log2phy_map(self, expert_map): result_dict[key] = [] result_dict[key].append(idx) - log2phy_map = torch.full((ranks_num, global_expert_num), + log2phy_map = torch.full((ranks_num, self.global_expert_num), -1, dtype=torch.int32) for rank in range(ranks_num): @@ -136,7 +140,27 @@ def generate_log2phy_map(self, expert_map): return log2phy_map def do_update_log2phy_map(self, layer_id, updated_log2phy_map): - + rank_id = torch.distributed.get_rank() if self.log2phy_map_per_layer[layer_id] is not None: - rank_id = torch.distributed.get_rank() - self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[rank_id]) \ No newline at end of file + self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[rank_id]) + + def global2local(self, + placement: torch.Tensor, + E_local: int + ) -> tuple[torch.Tensor, torch.Tensor]: + + G, _ = placement.shape + device = placement.device + + pt_local = torch.full(( G, E_local), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement >= 0 + g_idx, k_idx = valid.nonzero(as_tuple=True) + slot_idx = placement[g_idx, k_idx] + + pt_local[g_idx, slot_idx] = k_idx + + return pt_local From f5404dc650882c6f0423db9e87f9b38f756211c5 Mon Sep 17 00:00:00 2001 From: zhuo97 <49392868+zhuo97@users.noreply.github.com> Date: Mon, 16 Jun 2025 21:03:16 +0800 Subject: [PATCH 068/153] Fix the device error when using ray as vllm-acend backend (#884) 1. Remove RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES 2. Add lazy init for vllm_ascend_C Signed-off-by: zhuo97 <1103045176@qq.com> --- examples/offline_multi_step_custom_ops.py | 3 --- .../singlecard/ops/test_rotary_embedding.py | 4 ++- vllm_ascend/attention/attention.py | 4 +-- vllm_ascend/ops/rotary_embedding.py | 5 ++-- vllm_ascend/platform.py | 12 --------- vllm_ascend/utils.py | 27 +++++++++++++++++++ vllm_ascend/worker/worker_v1.py | 5 ++++ 7 files changed, 40 insertions(+), 20 deletions(-) diff --git a/examples/offline_multi_step_custom_ops.py b/examples/offline_multi_step_custom_ops.py index 82a1bf5759..59c7fafccd 100644 --- a/examples/offline_multi_step_custom_ops.py +++ b/examples/offline_multi_step_custom_ops.py @@ -19,9 +19,6 @@ from vllm import LLM, SamplingParams -import vllm_ascend.platform as pf - -pf.CUSTOM_OP_ENABLED = True # set True for custom Ops of Multi-Step. 
prompts = [ "Hello, my name is", "The president of the United States is", diff --git a/tests/e2e/singlecard/ops/test_rotary_embedding.py b/tests/e2e/singlecard/ops/test_rotary_embedding.py index 2d5ec18daf..a3504a88b2 100644 --- a/tests/e2e/singlecard/ops/test_rotary_embedding.py +++ b/tests/e2e/singlecard/ops/test_rotary_embedding.py @@ -10,7 +10,9 @@ import torch import torch.nn as nn -import vllm_ascend.platform # noqa: F401 +from vllm_ascend.utils import enable_custom_op + +enable_custom_op() # Only Neox style true scenario is supported for now IS_NEOX_STYLE = [True] diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py index a567cc5306..9c112ed620 100644 --- a/vllm_ascend/attention/attention.py +++ b/vllm_ascend/attention/attention.py @@ -36,7 +36,7 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ops.cache import concat_and_cache_mla -from vllm_ascend.platform import CUSTOM_OP_ENABLED +from vllm_ascend.utils import enable_custom_op from vllm_ascend.worker.model_runner import ( ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata) @@ -462,7 +462,7 @@ def advance_step(self, for i in range(num_queries): self.seq_lens[i] += 1 self.max_decode_seq_len = max(self.seq_lens) - if CUSTOM_OP_ENABLED: + if enable_custom_op(): #advance a step on NPU for existing inputs for a multi-step runner if custom ops is enabled torch.ops._C.advance_step_flashattn_ascendc( num_seqs=num_seqs, diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 0c2a00afb6..9f8ae784c5 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -22,11 +22,12 @@ from vllm.model_executor.layers.rotary_embedding import ( DeepseekScalingRotaryEmbedding, RotaryEmbedding) -from vllm_ascend.platform import CUSTOM_OP_ENABLED +from vllm_ascend.utils import enable_custom_op def custom_rotary_embedding_enabled(query, neox_style, head_size): - return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and CUSTOM_OP_ENABLED + return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and enable_custom_op( + ) def rope_forward_oot( diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 9194ae9a16..5a45e9eed7 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -16,7 +16,6 @@ # import gc -import logging import os from datetime import timedelta from typing import TYPE_CHECKING, Optional, Tuple @@ -32,16 +31,6 @@ from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes -CUSTOM_OP_ENABLED = False -try: - # register custom ops into torch_library here - import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401 - CUSTOM_OP_ENABLED = True -except ImportError as e: - logging.warning( - "Failed to import 'vllm_ascend.vllm_ascend_C': %s. All custom ops will be disabled. 
", - e) - if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig from vllm.utils import FlexibleArgumentParser @@ -50,7 +39,6 @@ VllmConfig = None FlexibleArgumentParser = None -os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1" os.environ["ACL_OP_INIT_MODE"] = ascend_envs.VLLM_ASCEND_ACL_OP_INIT_MODE diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index e29290e73a..d932053520 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -54,6 +54,8 @@ ASCEND_QUATIZATION_METHOD = "ascend" +CUSTOM_OP_ENABLED = None + def try_register_lib(lib_name: str, lib_info: str = ""): import importlib @@ -68,6 +70,31 @@ def try_register_lib(lib_name: str, lib_info: str = ""): pass +def enable_custom_op(): + """ + Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component. + Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device(). + """ + global CUSTOM_OP_ENABLED + + if CUSTOM_OP_ENABLED is not None: + return CUSTOM_OP_ENABLED + + else: + try: + # register custom ops into torch_library here + import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401 + CUSTOM_OP_ENABLED = True + + except ImportError: + CUSTOM_OP_ENABLED = False + logger.warning( + "Warning: Failed to register custom ops, all custom ops will be disabled" + ) + + return CUSTOM_OP_ENABLED + + def find_hccl_library() -> str: """ We either use the library file specified by the `HCCL_SO_PATH` diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index ba29dcfc56..ebdf01e155 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -117,6 +117,11 @@ def wake_up(self, tags: Optional[list[str]] = None) -> None: allocator = CaMemAllocator.get_instance() allocator.wake_up(tags=tags) + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + def init_device(self): if self.device_config.device.type == "npu": self.device = torch.device(f"npu:{self.local_rank_across_dp}") From 9d6394938bc7c4037a537c0097cf492eee289f8f Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Mon, 16 Jun 2025 22:09:22 +0800 Subject: [PATCH 069/153] improve the implement of communication between main process and eplb process --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 6 +++--- .../eplb/core/policy/policy_factory.py | 2 +- vllm_ascend/eplb/core/worker/eplb_worker.py | 21 +++++++++++-------- vllm_ascend/eplb/eplb_updator.py | 8 ++++++- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 4af0673d01..d42482a37a 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -32,7 +32,7 @@ def __init__(self, model, **args): self.param_dict = dict(self.model.named_parameters()) self.num_dense_layers = self.model.config.first_k_dense_replace self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers - self.global_expert_num = 256 + self.global_expert_num = self.model.config.n_routed_experts # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 is supported here self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", @@ -98,7 +98,7 @@ def do_update_expert_weight(self, layer_id, expert_id_before_replace, buffer_ten expert_tensor = 
self.param_dict[complete_name].data[local_expert_id] expert_tensor.copy_(self.buffer_tensor_dict[name][buffer_tensor_id]) - def generate_index_dicts(self,tensor_2d): + def generate_index_dicts(self, tensor_2d): dict_list = [] current_idx = 0 @@ -143,7 +143,7 @@ def do_update_log2phy_map(self, layer_id, updated_log2phy_map): rank_id = torch.distributed.get_rank() if self.log2phy_map_per_layer[layer_id] is not None: self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[rank_id]) - + def global2local(self, placement: torch.Tensor, E_local: int diff --git a/vllm_ascend/eplb/core/policy/policy_factory.py b/vllm_ascend/eplb/core/policy/policy_factory.py index 0aa727bd99..8e94861a0d 100644 --- a/vllm_ascend/eplb/core/policy/policy_factory.py +++ b/vllm_ascend/eplb/core/policy/policy_factory.py @@ -10,6 +10,6 @@ class PolicyFactory: def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy: policy = { 0:MockLoadBalance , # MockLoadBalance - 1:DynamicEP, # When real eplb algorithm is ready, recover this + 1:DynamicEP, } return policy.get(policy_type, MockLoadBalance)(config) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index a2683cfa8a..d76389f143 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -128,7 +128,7 @@ def compose_expert_update_info(self, updated_expert_maps, current_expert_maps): if not torch.isin(torch.tensor(expert_id), experts_to_send).any(): # if expert_id are not sent out from any npu, it will be copied from one npu holding this expert - candidate_src_rank_indices = torch.where(current_expert_maps_this_layer[:, expert_id] != -1) + candidate_src_rank_indices = torch.where(current_expert_maps_this_layer[:, expert_id] != -1)[0] else: candidate_src_rank_indices = src_rank_indices[experts_to_send == expert_id] @@ -245,7 +245,7 @@ def __init__(self, shared_dict, planner_q, block_update_q, policy_type: int = 0, self.worker = EplbWorker(self.shared_dict, self.policy_type, self.enable_d2d) - def worker_process(self,planner_q,block_update_q): + def worker_process(self, planner_q, block_update_q): """ Subprocess entry: bind to specified NPU, loop waiting for planner_q to wake up, call do_update, then notify main process update is complete. 
""" @@ -254,14 +254,17 @@ def worker_process(self,planner_q,block_update_q): planner_q.get() - update_info = self.worker.do_update() + update_info_generator = self.worker.do_update() + update_info_list = [] - for (a,b,c,d) in update_info: - while True: - if not block_update_q.empty(): - continue - block_update_q.put((a,b,c,d)) - break + for (send_info , recv_info , new_expert_map, layer_id) in update_info_generator: + update_info_list.append((send_info , recv_info , new_expert_map, layer_id)) + + while True: + if not block_update_q.empty(): + continue + block_update_q.put(update_info_list) + break except Exception as e: logger.warning(f"[EPLB subprocess Exiting due to error: {e}", exc_info=True) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 217da2f0a9..d4faf05ff6 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -41,6 +41,7 @@ def init_eplb(self): self.update_in_flight = False self.reqs = [] + self.update_info_all = [] self.cur_iterations: torch.int64 = 0 @@ -88,8 +89,12 @@ def wakeup_eplb_worker(self): def forward_before(self): self.get_init_expert_map() + # Batch after eplb process being triggered, get update info provided by eplb process + if self.update_in_flight and self.weight_update_counter == 0: + self.update_info_all = self.block_update_queue.get() + if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: - (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.block_update_queue.get() + (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.update_info_all.pop(0) rank_id = torch.distributed.get_rank() self.eplb_loader.generate_log2phy_map(updated_expert_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] @@ -101,6 +106,7 @@ def forward_before(self): if self.weight_update_counter == self.num_moe_layers: self.weight_update_counter = 0 self.update_in_flight = False + self.update_info_all = [] # set asynchronous stream for d2d expert weight update self.reqs = [] From 96fa7ff63b6866d1a13f821d7005eebbb1914183 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Mon, 16 Jun 2025 23:09:53 +0800 Subject: [PATCH 070/153] [DP][V1] Fix rank set in DP scenario & Bump torch-npu version to 2.5.1.post1.dev20250528 (#1235) ### What this PR does / why we need it? 1. Fix rank set in DP scenario. The new poc version of torch-npu support setting `ASCEND_RT_VISIBLE_DEVICES` dynamically, thus we could use the rank set in `DPEngineCoreProc` directly instead of calculating local rank across dp by hand in the patched `_init_data_parallel` Closes: https://github.com/vllm-project/vllm-ascend/issues/1170 2. Bump torch-npu version to 2.5.1.post1.dev20250528 Closes: https://github.com/vllm-project/vllm-ascend/pull/1242 Closes: https://github.com/vllm-project/vllm-ascend/issues/1232 ### How was this patch tested? CI passed with new added test. 
--------- Signed-off-by: MengqingCao Signed-off-by: Icey <1790571317@qq.com> Co-authored-by: Icey <1790571317@qq.com> --- .github/workflows/accuracy_test.yaml | 2 + .github/workflows/image_openeuler.yml | 6 ++ .github/workflows/image_ubuntu.yml | 6 ++ .github/workflows/nightly_benchmarks.yaml | 2 + .github/workflows/vllm_ascend_test.yaml | 5 ++ .../workflows/vllm_ascend_test_long_term.yaml | 2 + .github/workflows/vllm_ascend_test_pd.yaml | 2 + Dockerfile | 3 +- Dockerfile.openEuler | 3 +- README.md | 2 +- README.zh.md | 2 +- docs/source/installation.md | 11 ++-- pyproject.toml | 2 +- requirements.txt | 6 +- setup.py | 2 +- tests/multicard/test_data_parallel.py | 66 +++++++++++++++++++ vllm_ascend/patch/__init__.py | 13 +--- .../patch_common/patch_distributed.py | 18 +---- vllm_ascend/worker/worker_v1.py | 8 +-- 19 files changed, 114 insertions(+), 47 deletions(-) create mode 100644 tests/multicard/test_data_parallel.py diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index 999fb6ad5d..0297f625d0 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -173,6 +173,8 @@ jobs: - name: Install vllm-project/vllm-ascend working-directory: ./vllm-ascend + env: + PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | pip install -r requirements-dev.txt pip install -e . diff --git a/.github/workflows/image_openeuler.yml b/.github/workflows/image_openeuler.yml index 0d585e690f..690d8145cd 100644 --- a/.github/workflows/image_openeuler.yml +++ b/.github/workflows/image_openeuler.yml @@ -19,6 +19,12 @@ on: - '.github/workflows/image_openeuler.yml' - 'Dockerfile.openEuler' - 'vllm_ascend/**' + - 'setup.py' + - 'pyproject.toml' + - 'requirements.txt' + - 'cmake/**' + - 'CMakeLists.txt' + - 'csrc/**' push: # Publish image when tagging, the Dockerfile in tag will be build as tag image branches: diff --git a/.github/workflows/image_ubuntu.yml b/.github/workflows/image_ubuntu.yml index 63dadc64a7..a2cfbcefb0 100644 --- a/.github/workflows/image_ubuntu.yml +++ b/.github/workflows/image_ubuntu.yml @@ -19,6 +19,12 @@ on: - '.github/workflows/image_ubuntu.yml' - 'Dockerfile' - 'vllm_ascend/**' + - 'setup.py' + - 'pyproject.toml' + - 'requirements.txt' + - 'cmake/**' + - 'CMakeLists.txt' + - 'csrc/**' push: # Publish image when tagging, the Dockerfile in tag will be build as tag image branches: diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index b060076c9c..6e03c1e6be 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -115,6 +115,8 @@ jobs: VLLM_TARGET_DEVICE=empty pip install -e . - name: Install vllm-project/vllm-ascend + env: + PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | pip install -e . pip install -r benchmarks/requirements-bench.txt diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 62c91de8cc..540680dd2f 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -151,6 +151,7 @@ jobs: - name: Install vllm-project/vllm-ascend run: | + export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/ python3 -m pip install -v . 
--extra-index https://download.pytorch.org/whl/cpu/ @@ -214,6 +215,8 @@ jobs: VLLM_TARGET_DEVICE=empty pip install -e . - name: Install vllm-project/vllm-ascend + env: + PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | pip install -r requirements-dev.txt pip install -v -e . @@ -311,6 +314,8 @@ jobs: VLLM_TARGET_DEVICE=empty pip install -e . - name: Install vllm-project/vllm-ascend + env: + PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | pip install -r requirements-dev.txt pip install -v -e . diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index 46123590ab..e249849e19 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -88,6 +88,8 @@ jobs: VLLM_TARGET_DEVICE=empty pip install -e . - name: Install vllm-project/vllm-ascend + env: + PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | pip install -r requirements-dev.txt pip install -v -e . diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml index c2c76c9f23..932b3e59b3 100644 --- a/.github/workflows/vllm_ascend_test_pd.yaml +++ b/.github/workflows/vllm_ascend_test_pd.yaml @@ -97,6 +97,8 @@ jobs: VLLM_TARGET_DEVICE=empty pip install -e . - name: Install vllm-project/vllm-ascend + env: + PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi run: | pip install -r requirements-dev.txt pip install -v -e . diff --git a/Dockerfile b/Dockerfile index 952e77fe92..e6ec009d28 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,7 +46,8 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm # Install vllm-ascend # Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH -RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 2ff3d0b39a..4e414e04b7 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -43,7 +43,8 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ - python3 -m pip cache purge # Install vllm-ascend -RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/README.md b/README.md index f3f9296c1f..7d0966c8d4 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l - Software: * Python >= 3.9, < 3.12 * CANN >= 8.1.RC1 - * PyTorch >= 2.5.1, torch-npu >= 2.5.1 + * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250528 * vLLM (the same version as vllm-ascend) ## Getting Started diff --git a/README.zh.md b/README.zh.md 
index 6adb0e1832..2d2062a8b4 100644 --- a/README.zh.md +++ b/README.zh.md @@ -39,7 +39,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP - 软件: * Python >= 3.9, < 3.12 * CANN >= 8.1.RC1 - * PyTorch >= 2.5.1, torch-npu >= 2.5.1 + * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250528 * vLLM (与vllm-ascend版本一致) ## 开始使用 diff --git a/docs/source/installation.md b/docs/source/installation.md index c9f6844915..79e4a88dba 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -9,11 +9,11 @@ This document describes how to install vllm-ascend manually. - A hardware with Ascend NPU. It's usually the Atlas 800 A2 series. - Software: - | Software | Supported version | Note | - |-----------|-------------------|----------------------------------------| - | CANN | >= 8.1.RC1 | Required for vllm-ascend and torch-npu | - | torch-npu | >= 2.5.1 | Required for vllm-ascend | - | torch | >= 2.5.1 | Required for torch-npu and vllm | + | Software | Supported version | Note | + |---------------|----------------------------------|-------------------------------------------| + | CANN | >= 8.1.RC1 | Required for vllm-ascend and torch-npu | + | torch-npu | >= 2.5.1.post1.dev20250528 | Required for vllm-ascend | + | torch | >= 2.5.1 | Required for torch-npu and vllm | You have 2 way to install: - **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip. @@ -156,6 +156,7 @@ cd .. # Install vLLM Ascend git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git cd vllm-ascend +export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi pip install -v -e . cd .. ``` diff --git a/pyproject.toml b/pyproject.toml index b44197038b..514b755c32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ requires = [ "scipy", "setuptools>=64", "setuptools-scm>=8", - "torch-npu==2.5.1", + "torch-npu==2.5.1.post1.dev20250528", "torch>=2.5.1", "torchvision<0.21.0", "wheel", diff --git a/requirements.txt b/requirements.txt index 2f844df74a..eadb96f1e9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ pyyaml scipy setuptools>=64 setuptools-scm>=8 -torch-npu==2.5.1 torch>=2.5.1 torchvision<0.21.0 wheel @@ -21,3 +20,8 @@ quart # Required for N-gram speculative decoding numba + +# Install torch_npu +--pre +--extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi +torch-npu==2.5.1.post1.dev20250528 diff --git a/setup.py b/setup.py index 631e55c1b6..327c2f8791 100644 --- a/setup.py +++ b/setup.py @@ -152,7 +152,7 @@ def configure(self, ext: CMakeExtension) -> None: # if pybind11 is installed via pip pybind11_cmake_path = (subprocess.check_output( [python_executable, "-m", "pybind11", - "--cmake"]).decode().strip()) + "--cmakedir"]).decode().strip()) except subprocess.CalledProcessError as e: # else specify pybind11 path installed from source code on CI container raise RuntimeError(f"CMake configuration failed: {e}") diff --git a/tests/multicard/test_data_parallel.py b/tests/multicard/test_data_parallel.py new file mode 100644 index 0000000000..6c0a20de97 --- /dev/null +++ b/tests/multicard/test_data_parallel.py @@ -0,0 +1,66 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Compare the outputs of vLLM with and without aclgraph. + +Run `pytest tests/multicard/test_data_parallel.py`. +""" + +import os + +import pytest + +from tests.conftest import VllmRunner +from tests.model_utils import check_outputs_equal + +MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"] + + +@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", + reason="Data parallel only support on v1") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [32]) +def test_data_parallel_correctness( + model: str, + max_tokens: int, +) -> None: + example_prompts = [ + "Hello, my name is", "The president of the United States is", + "The capital of France is", "The future of AI is" + ] + + with VllmRunner(model_name=model, + max_model_len=1024, + max_num_seqs=16, + data_parallel_size=2, + distributed_executor_backend="mp") as vllm_model: + vllm_dp_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) + + with VllmRunner( + model_name=model, + max_model_len=1024, + max_num_seqs=16, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=vllm_outputs, + outputs_1_lst=vllm_dp_outputs, + name_0="vllm_outputs", + name_1="vllm_dp_outputs", + ) diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index 3c24bfc70f..d817f9063e 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -47,16 +47,7 @@ # Related PR (if no, explain why): # Future Plan: # Remove those patch when vllm merged them -# 2. `vllm.v1.engine.core.DPEngineCoreProc._init_data_parallel` -# Why: -# There is some bug for ASCEND_RT_VISIBLE_DEVICES usage. -# How: -# The ASCEND_RT_VISIBLE_DEVICES related code is dropped. -# Related PR (if no, explain why): -# No, this is a bug for vllm ascend -# Future Plan: -# Remove this patch once ASCEND_RT_VISIBLE_DEVICES bug is fixed. -# 3. `vllm.config.ParallelConfig.get_next_dp_init_port` +# 2. `vllm.config.ParallelConfig.get_next_dp_init_port` # Why: # vllm doesn't support get port from environment. # How: @@ -65,7 +56,7 @@ # Need a PR to vllm to support get port from environment. # Future Plan: # Remove those patch when vllm merged them -# 4. `vllm.config.ParallelConfig.ParallelConfig.stateless_init_dp_group` +# 3. 
`vllm.config.ParallelConfig.ParallelConfig.stateless_init_dp_group` # Why: # vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to # get better performance diff --git a/vllm_ascend/patch/platform/patch_common/patch_distributed.py b/vllm_ascend/patch/platform/patch_common/patch_distributed.py index 21f846d319..86515df86e 100644 --- a/vllm_ascend/patch/platform/patch_common/patch_distributed.py +++ b/vllm_ascend/patch/platform/patch_common/patch_distributed.py @@ -21,10 +21,9 @@ import vllm.distributed import vllm.envs as envs from torch.distributed import ProcessGroup -from vllm.config import ParallelConfig, VllmConfig +from vllm.config import ParallelConfig from vllm.distributed.utils import \ stateless_init_torch_distributed_process_group -from vllm.v1.engine.core import DPEngineCoreProc def ascend_destroy_model_parallel(): @@ -79,21 +78,6 @@ def stateless_init_dp_group(self) -> "ProcessGroup": return dp_group -def _init_data_parallel(self, vllm_config: VllmConfig): - # Configure NPUs and stateless process group for data parallel. - dp_rank = vllm_config.parallel_config.data_parallel_rank - dp_size = vllm_config.parallel_config.data_parallel_size - local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local - - assert dp_size > 1 - assert 0 <= local_dp_rank <= dp_rank < dp_size - - self.local_dp_rank = local_dp_rank - self.dp_group = vllm_config.parallel_config.stateless_init_dp_group() - self.current_wave = 0 - - vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel -DPEngineCoreProc._init_data_parallel = _init_data_parallel ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port ParallelConfig.stateless_init_dp_group = stateless_init_dp_group diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index ebdf01e155..76844c96a4 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -75,12 +75,6 @@ def __init__( distributed_init_method=distributed_init_method, is_driver_worker=is_driver_worker) - # NOTE(Yizhou): Since we do not set ASCEND_RT_VISIBLE_DEVICES in - # vllm_ascend, we need to set the device id manually. - local_dp_rank = self.vllm_config.parallel_config.data_parallel_rank_local - world_size = self.vllm_config.parallel_config.world_size - self.local_rank_across_dp = local_dp_rank * world_size + self.local_rank - # Try to import mindie_turbo to accelerate vLLM inference. try_register_lib( "mindie_turbo", @@ -124,7 +118,7 @@ def initialize_cache(self, num_gpu_blocks: int, def init_device(self): if self.device_config.device.type == "npu": - self.device = torch.device(f"npu:{self.local_rank_across_dp}") + self.device = torch.device(f"npu:{self.local_rank}") NPUPlatform.set_device(self.device) NPUPlatform.empty_cache() self.init_npu_memory = NPUPlatform.mem_get_info()[0] From 9d3cbc09538481ae2a3b10bb376aafe5c34e1af9 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Tue, 17 Jun 2025 08:52:26 +0800 Subject: [PATCH 071/153] [Doctest] add installation doctest (#1179) ### What this PR does / why we need it? Install doctest ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? 
CI passed Related: https://github.com/vllm-project/vllm-ascend/pull/983 Co-authored-by: wangli Signed-off-by: Yikun Jiang Co-authored-by: wangli --- .github/workflows/vllm_ascend_doctest.yaml | 18 ++++---- docs/source/conf.py | 5 +++ docs/source/installation.md | 3 +- tests/e2e/common.sh | 25 ++++++++++- tests/e2e/doctests/001-quickstart-test.sh | 2 - .../002-pip-binary-installation-test.sh | 42 +++++++++++++++++++ tests/e2e/run_doctests.sh | 6 +++ 7 files changed, 89 insertions(+), 12 deletions(-) create mode 100644 tests/e2e/doctests/002-pip-binary-installation-test.sh diff --git a/.github/workflows/vllm_ascend_doctest.yaml b/.github/workflows/vllm_ascend_doctest.yaml index 63ba746a12..67f98fbaf7 100644 --- a/.github/workflows/vllm_ascend_doctest.yaml +++ b/.github/workflows/vllm_ascend_doctest.yaml @@ -46,7 +46,7 @@ jobs: # Each version should be tested fail-fast: false matrix: - vllm_verison: [main, v0.7.3-dev, main-openeuler, v0.7.3-dev-openeuler] + vllm_verison: [v0.9.1-dev, v0.9.1-dev-openeuler, main, main-openeuler] name: vLLM Ascend test runs-on: linux-arm64-npu-1 container: @@ -70,13 +70,13 @@ jobs: run: | sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list apt-get update -y - apt install git curl -y + apt install -y gcc g++ libnuma-dev git curl jq - name: Config OS mirrors - openEuler if: ${{ endsWith(matrix.vllm_verison, '-openeuler') }} run: | yum update -y - yum install git curl -y + yum install -y gcc g++ numactl-devel git curl jq - name: Config pip mirrors run: | @@ -88,11 +88,13 @@ jobs: - name: Run vllm-ascend/tests/e2e/run_doctests.sh run: | # PWD: /__w/vllm-ascend/vllm-ascend - # Address old branch like v0.7.3: - if [ ! -d /vllm-workspace/vllm-ascend/tests/e2e ]; then - echo "Warning: the doctest path doesn't exists, copy now" - cp -r tests/e2e /vllm-workspace/vllm-ascend/tests/ - fi + echo "Replacing /vllm-workspace/vllm-ascend/tests/e2e ..." + rm -rf /vllm-workspace/vllm-ascend/tests/e2e + mkdir -p /vllm-workspace/vllm-ascend/tests + cp -r tests/e2e /vllm-workspace/vllm-ascend/tests/ + + # TODO(yikun): Remove this after conf.py merged + cp docs/source/conf.py /vllm-workspace/vllm-ascend/docs/source/ # Simulate container to enter directory cd /workspace diff --git a/docs/source/conf.py b/docs/source/conf.py index 4b8eb0eaaa..63fb5dbe46 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,6 +23,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # +import json import os # import sys @@ -133,3 +134,7 @@ def setup(app): pass + + +if __name__ == "__main__": + print(json.dumps(myst_substitutions)) diff --git a/docs/source/installation.md b/docs/source/installation.md index 79e4a88dba..c290f7e5f7 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -246,7 +246,8 @@ for output in outputs: Then run: ```bash -# export VLLM_USE_MODELSCOPE=true to speed up download if huggingface is not reachable. +# Try `export VLLM_USE_MODELSCOPE=true` and `pip install modelscope` +# to speed up download if huggingface is not reachable. python example.py ``` diff --git a/tests/e2e/common.sh b/tests/e2e/common.sh index a8b8810108..3c61524925 100644 --- a/tests/e2e/common.sh +++ b/tests/e2e/common.sh @@ -45,7 +45,30 @@ function wait_for_exit() { _info "===> Wait for ${VLLM_PID} to exit." sleep 1 done - _info "===> Wait for ${VLLM_PID} to exit." + _info "===> Process ${VLLM_PID} has exited." 
+} + +VENV_PATH=/tmp/vllm_venv + +function clean_venv() { + if [[ -n "$VENV_PATH" && -d "$VENV_PATH" ]]; then + _info "Cleaning up default virtual env path: ${VENV_PATH}" + deactivate || true + rm -rf "$VENV_PATH" + fi +} + +function create_vllm_venv() { + # make a clean env path + clean_venv + _info "Creating vllm virtual environment at ${VENV_PATH}" + python3 -m venv ${VENV_PATH} + source ${VENV_PATH}/bin/activate +} + +function get_version() { + local VERSION_NAME="$1" + python3 "${SCRIPT_DIR}/../../docs/source/conf.py" | jq .${VERSION_NAME} | tr -d '"' } SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) diff --git a/tests/e2e/doctests/001-quickstart-test.sh b/tests/e2e/doctests/001-quickstart-test.sh index 0e59255cb9..44dce0d7bc 100755 --- a/tests/e2e/doctests/001-quickstart-test.sh +++ b/tests/e2e/doctests/001-quickstart-test.sh @@ -23,13 +23,11 @@ function simple_test() { } function quickstart_offline_test() { - export VLLM_USE_MODELSCOPE=true # Do real script test python3 "${SCRIPT_DIR}/../../examples/offline_inference_npu.py" } function quickstart_online_test() { - export VLLM_USE_MODELSCOPE=true vllm serve Qwen/Qwen2.5-0.5B-Instruct & wait_url_ready "vllm serve" "localhost:8000/v1/models" # Do real curl test diff --git a/tests/e2e/doctests/002-pip-binary-installation-test.sh b/tests/e2e/doctests/002-pip-binary-installation-test.sh new file mode 100644 index 0000000000..48a33bf6ff --- /dev/null +++ b/tests/e2e/doctests/002-pip-binary-installation-test.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +trap clean_venv EXIT + +function install_binary_test() { + + create_vllm_venv + + PIP_VLLM_VERSION=$(get_version pip_vllm_version) + PIP_VLLM_ASCEND_VERSION=$(get_version pip_vllm_ascend_version) + _info "====> Install vllm==${PIP_VLLM_VERSION} and vllm-ascend ${PIP_VLLM_ASCEND_VERSION}" + + pip install vllm=="$(get_version pip_vllm_version)" + pip install vllm-ascend=="$(get_version pip_vllm_ascend_version)" + + pip list | grep vllm + + # Verify the installation + _info "====> Run offline example test" + pip install modelscope + python3 "${SCRIPT_DIR}/../../examples/offline_inference_npu.py" + +} + +_info "====> Start install_binary_test" +install_binary_test diff --git a/tests/e2e/run_doctests.sh b/tests/e2e/run_doctests.sh index ee6613ad5d..2b00b6466c 100755 --- a/tests/e2e/run_doctests.sh +++ b/tests/e2e/run_doctests.sh @@ -21,7 +21,13 @@ set -eo errexit . $(dirname "$0")/common.sh +export VLLM_USE_MODELSCOPE=true +export VLLM_LOGGING_LEVEL=ERROR + _info "====> Start Quickstart test" . "${SCRIPT_DIR}/doctests/001-quickstart-test.sh" +_info "====> Start pip binary installation test" +. "${SCRIPT_DIR}/doctests/002-pip-binary-installation-test.sh" + _info "Doctest passed." 
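The doctest version plumbing added above works in two steps: `docs/source/conf.py` now prints `myst_substitutions` as JSON when executed as a script, and `get_version()` in `tests/e2e/common.sh` extracts a single field from that JSON with `jq`. The same lookup is shown here as an illustrative Python sketch, not as part of the patch:

```python
# Illustrative sketch of what tests/e2e/common.sh:get_version() does:
# run conf.py as a script, parse the JSON it prints, and read one substitution key.
import json
import subprocess


def get_version(key: str, conf_py: str = "docs/source/conf.py") -> str:
    raw = subprocess.check_output(["python3", conf_py], text=True)
    return json.loads(raw)[key]


if __name__ == "__main__":
    # keys used by 002-pip-binary-installation-test.sh
    print(get_version("pip_vllm_version"))
    print(get_version("pip_vllm_ascend_version"))
```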
From 05dec7eda96fd284a9505aee3905b1a11dd7ee9a Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Tue, 17 Jun 2025 09:36:35 +0800 Subject: [PATCH 072/153] [Doc] Refactor and init user story page (#1224) ### What this PR does / why we need it? This PR refactor the user stories page: - Move it to community - Add initial info of LLaMA-Factory, Huggingface/trl, MindIE Turbo, GPUStack, verl - Add a new page for LLaMA-Factory ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Preview locally Signed-off-by: Yikun Jiang --- docs/source/community/user_stories/index.md | 19 ++++++++++++++++ .../community/user_stories/llamafactory.md | 19 ++++++++++++++++ docs/source/index.md | 8 +------ docs/source/user_stories/example.md | 15 ------------- docs/source/user_stories/index.md | 22 ------------------- 5 files changed, 39 insertions(+), 44 deletions(-) create mode 100644 docs/source/community/user_stories/index.md create mode 100644 docs/source/community/user_stories/llamafactory.md delete mode 100644 docs/source/user_stories/example.md delete mode 100644 docs/source/user_stories/index.md diff --git a/docs/source/community/user_stories/index.md b/docs/source/community/user_stories/index.md new file mode 100644 index 0000000000..1dc1e56e76 --- /dev/null +++ b/docs/source/community/user_stories/index.md @@ -0,0 +1,19 @@ +# User Stories + +Read case studies on how users and developers solves real, everyday problems with vLLM Ascend + +- [LLaMA-Factory](./llamafactory.md) is an easy-to-use and efficient platform for training and fine-tuning large language models, it supports vLLM Ascend to speed up inference since [LLaMA-Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739), gain 2x performance enhancement of inference. + +- [Huggingface/trl](https://github.com/huggingface/trl) is a cutting-edge library designed for post-training foundation models using advanced techniques like SFT, PPO and DPO, it uses vLLM Ascend since [v0.17.0](https://github.com/huggingface/trl/releases/tag/v0.17.0) to support RLHF on Ascend NPU. + +- [MindIE Turbo](https://pypi.org/project/mindie-turbo) is an LLM inference engine acceleration plug-in library developed by Huawei on Ascend hardware, which includes self-developed large language model optimization algorithms and optimizations related to the inference engine framework. It supports vLLM Ascend since [2.0rc1](https://www.hiascend.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev/mindie-turbo-0001.html). + +- [GPUStack](https://github.com/gpustack/gpustack) is an open-source GPU cluster manager for running AI models. It supports vLLM Ascend since [v0.6.2](https://github.com/gpustack/gpustack/releases/tag/v0.6.2), see more GPUStack performance evaluation info on [link](https://mp.weixin.qq.com/s/pkytJVjcH9_OnffnsFGaew). + +- [verl](https://github.com/volcengine/verl) is a flexible, efficient and production-ready RL training library for large language models (LLMs), uses vLLM Ascend since [v0.4.0](https://github.com/volcengine/verl/releases/tag/v0.4.0), see more info on [verl x Ascend Quickstart](https://verl.readthedocs.io/en/latest/ascend_tutorial/ascend_quick_start.html). 
+ +:::{toctree} +:caption: More details +:maxdepth: 1 +llamafactory +::: diff --git a/docs/source/community/user_stories/llamafactory.md b/docs/source/community/user_stories/llamafactory.md new file mode 100644 index 0000000000..bb95990eb0 --- /dev/null +++ b/docs/source/community/user_stories/llamafactory.md @@ -0,0 +1,19 @@ +# LLaMA-Factory + +**About / Introduction** + +[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) is an easy-to-use and efficient platform for training and fine-tuning large language models. With LLaMA-Factory, you can fine-tune hundreds of pre-trained models locally without writing any code. + +LLaMA-Facotory users need to evaluate and inference the model after fine-tuning the model. + +**The Business Challenge** + +LLaMA-Factory used transformers to perform inference on Ascend NPU, but the speed was slow. + +**Solving Challenges and Benefits with vLLM Ascend** + +With the joint efforts of LLaMA-Factory and vLLM Ascend ([LLaMA-Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739)), the performance of LLaMA-Factory in the model inference stage has been significantly improved. According to the test results, the inference speed of LLaMA-Factory has been increased to 2x compared to the transformers version. + +**Learn more** + +See more about LLaMA-Factory and how it uses vLLM Ascend for inference on the Ascend NPU in the following documentation: [LLaMA-Factory Ascend NPU Inference](https://llamafactory.readthedocs.io/en/latest/advanced/npu_inference.html). diff --git a/docs/source/index.md b/docs/source/index.md index 5f421b4679..e9b103b2c3 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -66,11 +66,5 @@ developer_guide/evaluation/index :maxdepth: 1 community/governance community/contributors -::: - -% User stories about vLLM Ascend project -:::{toctree} -:caption: User Story -:maxdepth: 1 -user_stories/index +community/user_stories/index ::: diff --git a/docs/source/user_stories/example.md b/docs/source/user_stories/example.md deleted file mode 100644 index ebbcf56d7b..0000000000 --- a/docs/source/user_stories/example.md +++ /dev/null @@ -1,15 +0,0 @@ -# xxx project uses Ascend vLLM, gain 200% performance enhancement of inference. - -## About / Introduction -Draft content - -## The Business Challenge -Our goal is to ... - -## Solving challenges with vLLM Ascend -vLLM Ascend helped us ... - -## Benefits using vLLM Ascend - -## Learn more -more info about this case diff --git a/docs/source/user_stories/index.md b/docs/source/user_stories/index.md deleted file mode 100644 index 6de8b21577..0000000000 --- a/docs/source/user_stories/index.md +++ /dev/null @@ -1,22 +0,0 @@ -# vLLM Ascend User Stories - -Read case studies on how users and developers solves real, everyday problems with vLLM Ascend - -:::{card} Example user story -:link: ./example -:link-type: doc - -xxx project uses Ascend vLLM, gain 200% performance enhancement of inference. 
- -+++ - -**Tags**: vLLM, Ascend, Inference - -::: - -:::{toctree} -:caption: Deployment -:maxdepth: 1 -:hidden: -example -::: From ae01e08b9bc81e45563a4d77f24cd773907174bb Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 17 Jun 2025 15:37:49 +0800 Subject: [PATCH 073/153] add compose_expert_update_info_bipartite --- vllm_ascend/eplb/core/worker/eplb_worker.py | 86 +++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index a2683cfa8a..5780b98c7a 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -227,6 +227,92 @@ def local2global(self, return placement_global +def compose_expert_update_info_bipartite(updated_expert_maps, + current_expert_maps): + num_layers = current_expert_maps.shape[0] + num_ranks = current_expert_maps.shape[1] + num_experts = current_expert_maps.shape[2] + + for layer_id in range(num_layers): + updated_expert_maps_this_layer = updated_expert_maps[layer_id] + current_expert_maps_this_layer = current_expert_maps[layer_id] + + expert_send_info_this_layer = dict() + expert_recv_info_this_layer = dict() + + # Guard Clause: if there is no expert weight update, avoid subsequent processing + if (np.equal(updated_expert_maps_this_layer, + current_expert_maps_this_layer)).all(): + yield (expert_send_info_this_layer, expert_recv_info_this_layer, + updated_expert_maps_this_layer, layer_id) + + # Parse expert_ids each rank needs to receive from other ranks + dst_rank_indices, experts_to_recv = np.where((current_expert_maps_this_layer == -1) + & (updated_expert_maps_this_layer != -1)) + + # record src ranks for potential transfer + src_ranks_set = dict() + for idx in range(len(dst_rank_indices)): + expert_id = experts_to_recv[idx].item() + if expert_id not in src_ranks_set: + src_ranks_set[expert_id] = np.where( + current_expert_maps_this_layer[:, expert_id] != -1)[0] + + # loop until all experts are scheduled + while len(dst_rank_indices) > 0: + # construct bipartite graph + G = nx.Graph() + for idx in range(len(dst_rank_indices)): + dst_rank_id = dst_rank_indices[idx].item() + expert_id = experts_to_recv[idx].item() + # add src ranks + src_rank_ids = src_ranks_set[expert_id] + G.add_nodes_from(src_rank_ids, bipartite=0) + # add dest rank + G.add_nodes_from(str(dst_rank_id), bipartite=1) + # add edges + for src_rank_id in src_rank_ids: + G.add_edge(src_rank_id, str(dst_rank_id)) + + # graph may not be connected + connected_components = list(nx.connected_components(G)) + all_matches = {} + # matching in this loop + for i, component in enumerate(connected_components): + subgraph = G.subgraph(component) + component_matching = nx.bipartite.maximum_matching(subgraph) + all_matches.update(component_matching) + + for u, v in all_matches.items(): + v = int(v) + assert u != v + if G.nodes[u]['bipartite'] == 0: + # currently not scheduled experts in rank v + experts_v = experts_to_recv[np.where( + dst_rank_indices == v)] + # src: u, dest: v, expert: expert_id + expert_id = np.intersect1d(experts_v, np.where( + current_expert_maps_this_layer[u] != -1))[0] + + # record send/rcv pairs + if u not in expert_send_info_this_layer: + expert_send_info_this_layer[u] = [] + if v not in expert_recv_info_this_layer: + expert_recv_info_this_layer[v] = [] + expert_send_info_this_layer[u].append((v, expert_id)) + expert_recv_info_this_layer[v].append((u, expert_id)) + + remove_index = np.where(np.logical_and( + dst_rank_indices == v, 
experts_to_recv == expert_id)) + + # update + dst_rank_indices = np.delete( + dst_rank_indices, remove_index) + experts_to_recv = np.delete(experts_to_recv, remove_index) + + yield expert_send_info_this_layer, expert_recv_info_this_layer, updated_expert_maps_this_layer, layer_id + + class EplbProcess: def __init__(self, shared_dict, planner_q, block_update_q, policy_type: int = 0, enable_d2d: bool = True): """ From 4b5cd842628c912ce35a5d9faf4f464fa9726637 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Tue, 17 Jun 2025 16:06:37 +0800 Subject: [PATCH 074/153] adapt compose_expert_update_info_bipartite into eplb process --- .../eplb/core/policy/policy_factory.py | 5 +- vllm_ascend/eplb/core/worker/eplb_worker.py | 186 +++++++++--------- 2 files changed, 100 insertions(+), 91 deletions(-) diff --git a/vllm_ascend/eplb/core/policy/policy_factory.py b/vllm_ascend/eplb/core/policy/policy_factory.py index 0aa727bd99..4650f832a7 100644 --- a/vllm_ascend/eplb/core/policy/policy_factory.py +++ b/vllm_ascend/eplb/core/policy/policy_factory.py @@ -9,7 +9,8 @@ class PolicyFactory: @staticmethod def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy: policy = { - 0:MockLoadBalance , # MockLoadBalance - 1:DynamicEP, # When real eplb algorithm is ready, recover this + 0:MockLoadBalance , # MockLoadBalance with greedy d2d expert weight composing + 1:DynamicEP, # Dynamic EPLB policy with greedy d2d expert weight composing + 2:DynamicEP, # Dynamic EPLB policy with bipartite d2d expert weight composing } return policy.get(policy_type, MockLoadBalance)(config) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 5780b98c7a..1f76b0588b 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -17,6 +17,7 @@ import time import numpy as np +import networkx as nx import torch import torch_npu import logging @@ -67,7 +68,9 @@ def do_update(self): logger.debug(f"[EPLB Process new_map differs, performing D2D") - update_info = self.compose_expert_update_info(new_expert_maps, self.old_expert_maps) + update_info = self.compose_expert_update_info_greedy(new_expert_maps, self.old_expert_maps)\ + if self.policy_type <= 1 else self.compose_expert_update_info_bipartite(new_expert_maps, self.old_expert_maps) + self.old_expert_maps = new_expert_maps logger.info("EPLB Process complete") @@ -95,8 +98,100 @@ def check_expert_placement(self, old_placement, new_placement): new_placement[layer_id] = old_placement[layer_id] break + def compose_expert_update_info_bipartite(self, updated_expert_maps_org, current_expert_maps_org): + # transform numpy array to torch tensor + updated_expert_maps = updated_expert_maps_org.clone() + current_expert_maps = current_expert_maps_org.clone() + updated_expert_maps = np.array(updated_expert_maps) + current_expert_maps = np.array(current_expert_maps) + + num_layers = current_expert_maps.shape[0] + num_ranks = current_expert_maps.shape[1] + num_experts = current_expert_maps.shape[2] + + for layer_id in range(num_layers): + updated_expert_maps_this_layer = updated_expert_maps[layer_id] + current_expert_maps_this_layer = current_expert_maps[layer_id] + updated_expert_maps_this_layer_org = updated_expert_maps_org[layer_id] + + expert_send_info_this_layer = dict() + expert_recv_info_this_layer = dict() + + # Guard Clause: if there is no expert weight update, avoid subsequent processing + if (np.equal(updated_expert_maps_this_layer, + current_expert_maps_this_layer)).all(): 
+ yield (expert_send_info_this_layer, expert_recv_info_this_layer, + updated_expert_maps_this_layer_org, layer_id) + + # Parse expert_ids each rank needs to receive from other ranks + dst_rank_indices, experts_to_recv = np.where((current_expert_maps_this_layer == -1) + & (updated_expert_maps_this_layer != -1)) + + # record src ranks for potential transfer + src_ranks_set = dict() + for idx in range(len(dst_rank_indices)): + expert_id = experts_to_recv[idx].item() + if expert_id not in src_ranks_set: + src_ranks_set[expert_id] = np.where( + current_expert_maps_this_layer[:, expert_id] != -1)[0] + + # loop until all experts are scheduled + while len(dst_rank_indices) > 0: + # construct bipartite graph + graph_expert_update = nx.Graph() + for idx in range(len(dst_rank_indices)): + dst_rank_id = dst_rank_indices[idx].item() + expert_id = experts_to_recv[idx].item() + # add src ranks + src_rank_ids = src_ranks_set[expert_id] + graph_expert_update.add_nodes_from(src_rank_ids, bipartite=0) + # add dest rank + graph_expert_update.add_nodes_from(str(dst_rank_id), bipartite=1) + # add edges + for src_rank_id in src_rank_ids: + graph_expert_update.add_edge(src_rank_id, str(dst_rank_id)) + + # graph may not be connected + connected_components = list(nx.connected_components(graph_expert_update)) + all_matches = {} + # matching in this loop + for i, component in enumerate(connected_components): + subgraph = graph_expert_update.subgraph(component) + component_matching = nx.bipartite.maximum_matching(subgraph) + all_matches.update(component_matching) + + for src_rank, dst_rank in all_matches.items(): + dst_rank = int(dst_rank) + assert src_rank != dst_rank + if graph_expert_update.nodes[src_rank]['bipartite'] == 0: + # currently not scheduled experts in rank dst_rank + experts_v = experts_to_recv[np.where( + dst_rank_indices == dst_rank)] + # src: src_rank, dest: dst_rank, expert: expert_id + expert_id = np.intersect1d(experts_v, np.where( + current_expert_maps_this_layer[src_rank] != -1))[0] + + # record send/rcv pairs + if src_rank not in expert_send_info_this_layer: + expert_send_info_this_layer[src_rank] = [] + if dst_rank not in expert_recv_info_this_layer: + expert_recv_info_this_layer[dst_rank] = [] + expert_send_info_this_layer[src_rank].append((dst_rank, expert_id)) + expert_recv_info_this_layer[dst_rank].append((src_rank, expert_id)) + + remove_index = np.where(np.logical_and( + dst_rank_indices == dst_rank, experts_to_recv == expert_id)) + + # update + dst_rank_indices = np.delete( + dst_rank_indices, remove_index) + experts_to_recv = np.delete(experts_to_recv, remove_index) + + yield (expert_send_info_this_layer, expert_recv_info_this_layer, + updated_expert_maps_this_layer_org, layer_id) + # TODO: Here only expert weight exchange is considered, need to be extended to cover other weight update cases - def compose_expert_update_info(self, updated_expert_maps, current_expert_maps): + def compose_expert_update_info_greedy(self, updated_expert_maps, current_expert_maps): num_layers = current_expert_maps.shape[0] num_ranks = current_expert_maps.shape[1] num_experts = current_expert_maps.shape[2] @@ -226,93 +321,6 @@ def local2global(self, return placement_global - -def compose_expert_update_info_bipartite(updated_expert_maps, - current_expert_maps): - num_layers = current_expert_maps.shape[0] - num_ranks = current_expert_maps.shape[1] - num_experts = current_expert_maps.shape[2] - - for layer_id in range(num_layers): - updated_expert_maps_this_layer = updated_expert_maps[layer_id] - 
current_expert_maps_this_layer = current_expert_maps[layer_id] - - expert_send_info_this_layer = dict() - expert_recv_info_this_layer = dict() - - # Guard Clause: if there is no expert weight update, avoid subsequent processing - if (np.equal(updated_expert_maps_this_layer, - current_expert_maps_this_layer)).all(): - yield (expert_send_info_this_layer, expert_recv_info_this_layer, - updated_expert_maps_this_layer, layer_id) - - # Parse expert_ids each rank needs to receive from other ranks - dst_rank_indices, experts_to_recv = np.where((current_expert_maps_this_layer == -1) - & (updated_expert_maps_this_layer != -1)) - - # record src ranks for potential transfer - src_ranks_set = dict() - for idx in range(len(dst_rank_indices)): - expert_id = experts_to_recv[idx].item() - if expert_id not in src_ranks_set: - src_ranks_set[expert_id] = np.where( - current_expert_maps_this_layer[:, expert_id] != -1)[0] - - # loop until all experts are scheduled - while len(dst_rank_indices) > 0: - # construct bipartite graph - G = nx.Graph() - for idx in range(len(dst_rank_indices)): - dst_rank_id = dst_rank_indices[idx].item() - expert_id = experts_to_recv[idx].item() - # add src ranks - src_rank_ids = src_ranks_set[expert_id] - G.add_nodes_from(src_rank_ids, bipartite=0) - # add dest rank - G.add_nodes_from(str(dst_rank_id), bipartite=1) - # add edges - for src_rank_id in src_rank_ids: - G.add_edge(src_rank_id, str(dst_rank_id)) - - # graph may not be connected - connected_components = list(nx.connected_components(G)) - all_matches = {} - # matching in this loop - for i, component in enumerate(connected_components): - subgraph = G.subgraph(component) - component_matching = nx.bipartite.maximum_matching(subgraph) - all_matches.update(component_matching) - - for u, v in all_matches.items(): - v = int(v) - assert u != v - if G.nodes[u]['bipartite'] == 0: - # currently not scheduled experts in rank v - experts_v = experts_to_recv[np.where( - dst_rank_indices == v)] - # src: u, dest: v, expert: expert_id - expert_id = np.intersect1d(experts_v, np.where( - current_expert_maps_this_layer[u] != -1))[0] - - # record send/rcv pairs - if u not in expert_send_info_this_layer: - expert_send_info_this_layer[u] = [] - if v not in expert_recv_info_this_layer: - expert_recv_info_this_layer[v] = [] - expert_send_info_this_layer[u].append((v, expert_id)) - expert_recv_info_this_layer[v].append((u, expert_id)) - - remove_index = np.where(np.logical_and( - dst_rank_indices == v, experts_to_recv == expert_id)) - - # update - dst_rank_indices = np.delete( - dst_rank_indices, remove_index) - experts_to_recv = np.delete(experts_to_recv, remove_index) - - yield expert_send_info_this_layer, expert_recv_info_this_layer, updated_expert_maps_this_layer, layer_id - - class EplbProcess: def __init__(self, shared_dict, planner_q, block_update_q, policy_type: int = 0, enable_d2d: bool = True): """ From d5dc94633dd2d28daf850d184beaba239fc70f0b Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Mon, 16 Jun 2025 20:48:56 +0800 Subject: [PATCH 075/153] fix bugsw --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 50 ++++++++++++++++++------ 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index b56468a66c..4af0673d01 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -17,9 +17,12 @@ import torch import torch.distributed as dist +import numpy as np from vllm_ascend.eplb.adaptor.abstract_adaptor import 
EplbAdaptor from vllm.logger import logger +import random + class VllmEplbAdaptor(EplbAdaptor): @@ -29,6 +32,7 @@ def __init__(self, model, **args): self.param_dict = dict(self.model.named_parameters()) self.num_dense_layers = self.model.config.first_k_dense_replace self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers + self.global_expert_num = 256 # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 is supported here self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", @@ -72,18 +76,16 @@ def get_init_expert_map(self, num_moe_layers): expert_map = self.model.get_all_expert_map(num_moe_layers) if dist.is_initialized(): world_size = dist.get_world_size() + rank = dist.get_rank() - rank = dist.get_rank() - - tensor_list = [ - torch.zeros_like(expert_map) for _ in range(world_size) - ] + gathered = torch.empty((world_size, *expert_map.shape), # [W, L, E] + dtype=expert_map.dtype, + device=expert_map.device) - dist.all_gather(tensor_list, expert_map) - gathered = torch.stack(tensor_list, dim=0) - all_maps = gathered.permute(1, 0, 2).contiguous() + dist.all_gather_into_tensor(gathered, expert_map) + all_maps = gathered.permute(1, 0, 2) + all_expert_maps = all_maps.cpu() - all_expert_maps = all_maps.to(torch.device("cpu")) return all_expert_maps def do_update_expert_map(self, layer_id, updated_expert_map): @@ -111,6 +113,8 @@ def generate_index_dicts(self,tensor_2d): return dict_list def generate_log2phy_map(self, expert_map): + num_local_experts = expert_map.max() + 1 + expert_map = self.global2local(expert_map,num_local_experts) ranks_num, global_expert_num = expert_map.shape concatenated = torch.flatten(expert_map) rank_expert_to_global = self.generate_index_dicts( @@ -122,7 +126,7 @@ def generate_log2phy_map(self, expert_map): result_dict[key] = [] result_dict[key].append(idx) - log2phy_map = torch.full((ranks_num, global_expert_num), + log2phy_map = torch.full((ranks_num, self.global_expert_num), -1, dtype=torch.int32) for rank in range(ranks_num): @@ -136,7 +140,27 @@ def generate_log2phy_map(self, expert_map): return log2phy_map def do_update_log2phy_map(self, layer_id, updated_log2phy_map): - + rank_id = torch.distributed.get_rank() if self.log2phy_map_per_layer[layer_id] is not None: - rank_id = torch.distributed.get_rank() - self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[rank_id]) \ No newline at end of file + self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[rank_id]) + + def global2local(self, + placement: torch.Tensor, + E_local: int + ) -> tuple[torch.Tensor, torch.Tensor]: + + G, _ = placement.shape + device = placement.device + + pt_local = torch.full(( G, E_local), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement >= 0 + g_idx, k_idx = valid.nonzero(as_tuple=True) + slot_idx = placement[g_idx, k_idx] + + pt_local[g_idx, slot_idx] = k_idx + + return pt_local From 86be76f3d4df24485be8e0e2d2a91f72dbb059c1 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Mon, 16 Jun 2025 22:09:22 +0800 Subject: [PATCH 076/153] improve the implement of communication between main process and eplb process --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 6 +++--- .../eplb/core/policy/policy_factory.py | 2 +- vllm_ascend/eplb/core/worker/eplb_worker.py | 21 +++++++++++-------- vllm_ascend/eplb/eplb_updator.py | 8 ++++++- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py 
b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 4af0673d01..d42482a37a 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -32,7 +32,7 @@ def __init__(self, model, **args): self.param_dict = dict(self.model.named_parameters()) self.num_dense_layers = self.model.config.first_k_dense_replace self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers - self.global_expert_num = 256 + self.global_expert_num = self.model.config.n_routed_experts # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 is supported here self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", @@ -98,7 +98,7 @@ def do_update_expert_weight(self, layer_id, expert_id_before_replace, buffer_ten expert_tensor = self.param_dict[complete_name].data[local_expert_id] expert_tensor.copy_(self.buffer_tensor_dict[name][buffer_tensor_id]) - def generate_index_dicts(self,tensor_2d): + def generate_index_dicts(self, tensor_2d): dict_list = [] current_idx = 0 @@ -143,7 +143,7 @@ def do_update_log2phy_map(self, layer_id, updated_log2phy_map): rank_id = torch.distributed.get_rank() if self.log2phy_map_per_layer[layer_id] is not None: self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[rank_id]) - + def global2local(self, placement: torch.Tensor, E_local: int diff --git a/vllm_ascend/eplb/core/policy/policy_factory.py b/vllm_ascend/eplb/core/policy/policy_factory.py index 0aa727bd99..8e94861a0d 100644 --- a/vllm_ascend/eplb/core/policy/policy_factory.py +++ b/vllm_ascend/eplb/core/policy/policy_factory.py @@ -10,6 +10,6 @@ class PolicyFactory: def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy: policy = { 0:MockLoadBalance , # MockLoadBalance - 1:DynamicEP, # When real eplb algorithm is ready, recover this + 1:DynamicEP, } return policy.get(policy_type, MockLoadBalance)(config) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index a2683cfa8a..d76389f143 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -128,7 +128,7 @@ def compose_expert_update_info(self, updated_expert_maps, current_expert_maps): if not torch.isin(torch.tensor(expert_id), experts_to_send).any(): # if expert_id are not sent out from any npu, it will be copied from one npu holding this expert - candidate_src_rank_indices = torch.where(current_expert_maps_this_layer[:, expert_id] != -1) + candidate_src_rank_indices = torch.where(current_expert_maps_this_layer[:, expert_id] != -1)[0] else: candidate_src_rank_indices = src_rank_indices[experts_to_send == expert_id] @@ -245,7 +245,7 @@ def __init__(self, shared_dict, planner_q, block_update_q, policy_type: int = 0, self.worker = EplbWorker(self.shared_dict, self.policy_type, self.enable_d2d) - def worker_process(self,planner_q,block_update_q): + def worker_process(self, planner_q, block_update_q): """ Subprocess entry: bind to specified NPU, loop waiting for planner_q to wake up, call do_update, then notify main process update is complete. 
""" @@ -254,14 +254,17 @@ def worker_process(self,planner_q,block_update_q): planner_q.get() - update_info = self.worker.do_update() + update_info_generator = self.worker.do_update() + update_info_list = [] - for (a,b,c,d) in update_info: - while True: - if not block_update_q.empty(): - continue - block_update_q.put((a,b,c,d)) - break + for (send_info , recv_info , new_expert_map, layer_id) in update_info_generator: + update_info_list.append((send_info , recv_info , new_expert_map, layer_id)) + + while True: + if not block_update_q.empty(): + continue + block_update_q.put(update_info_list) + break except Exception as e: logger.warning(f"[EPLB subprocess Exiting due to error: {e}", exc_info=True) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 217da2f0a9..d4faf05ff6 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -41,6 +41,7 @@ def init_eplb(self): self.update_in_flight = False self.reqs = [] + self.update_info_all = [] self.cur_iterations: torch.int64 = 0 @@ -88,8 +89,12 @@ def wakeup_eplb_worker(self): def forward_before(self): self.get_init_expert_map() + # Batch after eplb process being triggered, get update info provided by eplb process + if self.update_in_flight and self.weight_update_counter == 0: + self.update_info_all = self.block_update_queue.get() + if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: - (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.block_update_queue.get() + (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.update_info_all.pop(0) rank_id = torch.distributed.get_rank() self.eplb_loader.generate_log2phy_map(updated_expert_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] @@ -101,6 +106,7 @@ def forward_before(self): if self.weight_update_counter == self.num_moe_layers: self.weight_update_counter = 0 self.update_in_flight = False + self.update_info_all = [] # set asynchronous stream for d2d expert weight update self.reqs = [] From 03abde3c62c5a35518f181f535274ad4d7db125f Mon Sep 17 00:00:00 2001 From: raindaywhu Date: Tue, 17 Jun 2025 17:38:36 +0800 Subject: [PATCH 077/153] move generate log2ph map to eplb_worker --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 61 -------------- .../core/loader/device_transfer_loader.py | 9 +- vllm_ascend/eplb/core/worker/eplb_worker.py | 9 +- vllm_ascend/eplb/eplb_updator.py | 16 ++-- vllm_ascend/eplb/tool/eplb_utils.py | 84 +++++++++++++++++++ vllm_ascend/worker/model_runner_v1.py | 2 +- 6 files changed, 105 insertions(+), 76 deletions(-) create mode 100644 vllm_ascend/eplb/tool/eplb_utils.py diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index d42482a37a..b9d798fbe6 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -98,69 +98,8 @@ def do_update_expert_weight(self, layer_id, expert_id_before_replace, buffer_ten expert_tensor = self.param_dict[complete_name].data[local_expert_id] expert_tensor.copy_(self.buffer_tensor_dict[name][buffer_tensor_id]) - def generate_index_dicts(self, tensor_2d): - dict_list = [] - current_idx = 0 - - for row in tensor_2d: - value_to_index = {} - for i in range(row.size(0)): - value = row[i].item() - value_to_index[value] = current_idx + i - dict_list.append(value_to_index) - current_idx += row.size(0) - - return dict_list - - def generate_log2phy_map(self, expert_map): - num_local_experts = 
expert_map.max() + 1 - expert_map = self.global2local(expert_map,num_local_experts) - ranks_num, global_expert_num = expert_map.shape - concatenated = torch.flatten(expert_map) - rank_expert_to_global = self.generate_index_dicts( - expert_map) - result_dict: Dict[int, List[int]] = {} - for idx, value in enumerate(concatenated): - key = value.item() - if key not in result_dict: - result_dict[key] = [] - result_dict[key].append(idx) - - log2phy_map = torch.full((ranks_num, self.global_expert_num), - -1, - dtype=torch.int32) - for rank in range(ranks_num): - for key in result_dict: - indices_in_concat = result_dict[key] - if key in rank_expert_to_global[rank]: - log2phy_map[rank][key] = rank_expert_to_global[rank][key] - else: - chosen_index = random.choice(indices_in_concat) - log2phy_map[rank][key] = chosen_index - return log2phy_map - def do_update_log2phy_map(self, layer_id, updated_log2phy_map): rank_id = torch.distributed.get_rank() if self.log2phy_map_per_layer[layer_id] is not None: self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[rank_id]) - def global2local(self, - placement: torch.Tensor, - E_local: int - ) -> tuple[torch.Tensor, torch.Tensor]: - - G, _ = placement.shape - device = placement.device - - pt_local = torch.full(( G, E_local), - fill_value=-1, - dtype=torch.long, - device=device) - - valid = placement >= 0 - g_idx, k_idx = valid.nonzero(as_tuple=True) - slot_idx = placement[g_idx, k_idx] - - pt_local[g_idx, slot_idx] = k_idx - - return pt_local diff --git a/vllm_ascend/eplb/core/loader/device_transfer_loader.py b/vllm_ascend/eplb/core/loader/device_transfer_loader.py index d7afb47e5f..17bd47e141 100644 --- a/vllm_ascend/eplb/core/loader/device_transfer_loader.py +++ b/vllm_ascend/eplb/core/loader/device_transfer_loader.py @@ -69,8 +69,8 @@ def generate_expert_d2d_transfer_task(self, expert_send_info, expert_recv_info, self.state = ExpertWeightUpdateState.READY - def generate_log2phy_map(self,expert_map): - self.updated_log2phy_map = self.eplb_adaptor.generate_log2phy_map(expert_map) + def set_log2phy_map(self, log2phy_map): + self.updated_log2phy_map = log2phy_map def asyn_expert_weight_transfer(self, reqs): # Only when send/recv tasks are parsed into self.comm_op_list, d2d send/recv tasks can be luanched @@ -83,7 +83,7 @@ def asyn_expert_weight_transfer(self, reqs): self.state = ExpertWeightUpdateState.TRANSFERING - def update_expert_map_and_weight(self, reqs): + def update_expert_map_and_weight(self, reqs, redundant_enable): # Only after send/recv tasks have been luanched, expert_map and weight can be updated if self.state != ExpertWeightUpdateState.TRANSFERING: return @@ -99,7 +99,8 @@ def update_expert_map_and_weight(self, reqs): self.eplb_adaptor.do_update_expert_map(self.layer_id, self.updated_expert_map) #update log2phy_map - self.eplb_adaptor.do_update_log2phy_map(self.layer_id, self.updated_log2phy_map) + if redundant_enable: + self.eplb_adaptor.do_update_log2phy_map(self.layer_id, self.updated_log2phy_map) # update expert weight buffer_tensor_id = 0 diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index d76389f143..7fe01024f5 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -26,6 +26,7 @@ from vllm.logger import logger from vllm_ascend.eplb.core.policy.policy_factory import PolicyFactory, DynamicConfig +from vllm_ascend.eplb.tool.eplb_utils import ExpertMapUtils class EplbWorker: @@ -69,7 +70,7 @@ def do_update(self): update_info = 
self.compose_expert_update_info(new_expert_maps, self.old_expert_maps) self.old_expert_maps = new_expert_maps - logger.info("EPLB Process complete") + logger.info("EPLB Process compute complete") return update_info @@ -228,7 +229,7 @@ def local2global(self, class EplbProcess: - def __init__(self, shared_dict, planner_q, block_update_q, policy_type: int = 0, enable_d2d: bool = True): + def __init__(self, shared_dict, planner_q, block_update_q, redundant_enable, policy_type: int = 0, enable_d2d: bool = True): """ Args: shared_dict: Cross-process shared dict returned by Manager().dict() @@ -240,6 +241,7 @@ def __init__(self, shared_dict, planner_q, block_update_q, policy_type: int = 0, self.enable_d2d = enable_d2d self.planner_q = planner_q self.block_update_q = block_update_q + self.redundant_enable = redundant_enable # Create EplbWorker instance self.worker = EplbWorker(self.shared_dict, self.policy_type, self.enable_d2d) @@ -258,7 +260,8 @@ def worker_process(self, planner_q, block_update_q): update_info_list = [] for (send_info , recv_info , new_expert_map, layer_id) in update_info_generator: - update_info_list.append((send_info , recv_info , new_expert_map, layer_id)) + log2phy_map = ExpertMapUtils.generate_log2phy_map(new_expert_map) if self.redundant_enable else None + update_info_list.append((send_info , recv_info , new_expert_map, log2phy_map, layer_id)) while True: if not block_update_q.empty(): diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index d4faf05ff6..827ab3ead2 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -24,17 +24,18 @@ class EplbUpdator: - def __init__(self): - self.init_eplb() + def __init__(self, redundant_enable): + self.init_eplb(redundant_enable) def set_adaptor(self, adaptor): self.adaptor = adaptor self.eplb_loader = D2DExpertWeightLoader(eplb_adaptor=self.adaptor) self.num_moe_layers = self.adaptor.num_moe_layers - def init_eplb(self): + def init_eplb(self, redundant_enable): - self.num_iterations: torch.int64 = 10 + self.redundant_enable = redundant_enable + self.num_iterations: torch.int64 = 130 self.weight_update_counter = 0 self.expert_map_initialized = False @@ -62,6 +63,7 @@ def init_eplb(self): shared_dict = self.shared_dict, planner_q = self.planner_block_queue, block_update_q = self.block_update_queue, + redundant_enable = self.redundant_enable, policy_type = 0, enable_d2d = True ) @@ -94,9 +96,9 @@ def forward_before(self): self.update_info_all = self.block_update_queue.get() if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: - (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.update_info_all.pop(0) + (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(0) rank_id = torch.distributed.get_rank() - self.eplb_loader.generate_log2phy_map(updated_expert_map) + self.eplb_loader.set_log2phy_map(log2phy_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] #logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") @@ -118,7 +120,7 @@ def forward_end(self): self.wakeup_eplb_worker() self.update_in_flight = True - self.eplb_loader.update_expert_map_and_weight(self.reqs) + self.eplb_loader.update_expert_map_and_weight(self.reqs, self.redundant_enable) def 
compute_and_set_moe_load(self): local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers) diff --git a/vllm_ascend/eplb/tool/eplb_utils.py b/vllm_ascend/eplb/tool/eplb_utils.py new file mode 100644 index 0000000000..ff048eb4ac --- /dev/null +++ b/vllm_ascend/eplb/tool/eplb_utils.py @@ -0,0 +1,84 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import torch + +class ExpertMapUtils(): + + @classmethod + def generate_index_dicts(cls, tensor_2d): + dict_list = [] + current_idx = 0 + + for row in tensor_2d: + value_to_index = {} + for i in range(row.size(0)): + value = row[i].item() + value_to_index[value] = current_idx + i + dict_list.append(value_to_index) + current_idx += row.size(0) + + return dict_list + + @classmethod + def generate_log2phy_map(cls, expert_map): + num_local_experts = expert_map.max() + 1 + expert_map = cls.global2local(expert_map, num_local_experts) + ranks_num, global_expert_num = expert_map.shape + concatenated = torch.flatten(expert_map) + rank_expert_to_global = cls.generate_index_dicts(expert_map) + result_dict: Dict[int, List[int]] = {} + for idx, value in enumerate(concatenated): + key = value.item() + if key not in result_dict: + result_dict[key] = [] + result_dict[key].append(idx) + + log2phy_map = torch.full((ranks_num, self.global_expert_num), + -1, + dtype=torch.int32) + for rank in range(ranks_num): + for key in result_dict: + indices_in_concat = result_dict[key] + if key in rank_expert_to_global[rank]: + log2phy_map[rank][key] = rank_expert_to_global[rank][key] + else: + chosen_index = random.choice(indices_in_concat) + log2phy_map[rank][key] = chosen_index + return log2phy_map + + @classmethod + def global2local(cls, + placement: torch.Tensor, + E_local: int + ) -> tuple[torch.Tensor, torch.Tensor]: + + G, _ = placement.shape + device = placement.device + + pt_local = torch.full(( G, E_local), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement >= 0 + g_idx, k_idx = valid.nonzero(as_tuple=True) + slot_idx = placement[g_idx, k_idx] + + pt_local[g_idx, slot_idx] = k_idx + + return pt_local diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 9c010be90b..d56e8e0682 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -371,7 +371,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.dynamic_eplb = ascend_config.dynamic_eplb if self.dynamic_eplb == True: self.eplb_adaptor = None - self.eplb_updator = EplbUpdator() + self.eplb_updator = EplbUpdator(ascend_config.expert_map_path != None) def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler From 23ca68d0c8557a91b7213782de5c7fafa0cfb985 Mon Sep 17 00:00:00 2001 From: zzzzwwjj <34335947+zzzzwwjj@users.noreply.github.com> Date: Tue, 17 Jun 2025 17:49:03 
+0800 Subject: [PATCH 078/153] [refactor] Refactoring AscendFusedMoE (#1229) ### What this PR does / why we need it? This PR is used for resolved [issue 1147](https://github.com/vllm-project/vllm-ascend/issues/1147) 1. Move fused_moe code into one file `fused_moe.py`. 2. Integrate branch conditions into function `get_fused_moe_state`. ### Does this PR introduce _any_ user-facing change? 1. This PR has removed the env `VLLM_ENABLE_MC2`, because I think this env is useless, we can make judgments based on the current scenario without this env, it will only increase complexity. 2. This PR has removed the env `USING_LCCL_COM`, because this env has already expired. 3. `additional_config.expert_tensor_parallel_size` has already expired, and now we also use parameter `enable_expert_parallel`, consistent with the vLLM. ### How was this patch tested? Signed-off-by: zzzzwwjj <1183291235@qq.com> --- vllm_ascend/attention/mla_v1.py | 3 + vllm_ascend/envs.py | 8 -- vllm_ascend/models/deepseek_dbo.py | 45 +------ vllm_ascend/models/deepseek_v2.py | 59 ++------- vllm_ascend/ops/fused_moe.py | 152 +++++++++++++---------- vllm_ascend/quantization/w8a8_dynamic.py | 12 +- vllm_ascend/utils.py | 19 +++ vllm_ascend/worker/model_runner_v1.py | 51 ++++---- vllm_ascend/worker/worker_v1.py | 15 ++- 9 files changed, 155 insertions(+), 209 deletions(-) diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index f741508791..189aa38e89 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -136,6 +136,7 @@ class AscendMLAMetadata: # For logging. num_input_tokens: int = 0 # Number of tokens including padding. + max_num_tokens_across_dp: int = 0 with_prefill_across_dp: bool = False query_lens: Optional[list[int]] = None @@ -364,6 +365,7 @@ def build( common_attn_metadata: CommonAttentionMetadata, common_prefix_len: Optional[int] = None, graph_pad_size: int = -1, + max_num_tokens_across_dp: int = 0, with_prefill_across_dp: bool = False, ) -> AscendMLAMetadata: assert self._num_decodes + self._num_prefills == num_reqs @@ -509,6 +511,7 @@ def build( query_start_loc=query_start_loc, block_tables=block_table, seq_lens=seq_lens, + max_num_tokens_across_dp=max_num_tokens_across_dp, with_prefill_across_dp=with_prefill_across_dp, ) diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index 8d7c10eb20..74e9c1969e 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -50,18 +50,10 @@ # value is None, which means the system default C compiler will be used. "C_COMPILER": lambda: os.getenv("C_COMPILER", None), - # Whether to enable MC2 for DeepSeek. If not set, the default value is False. - # MC2 is a fusion operator provided by Ascend to speed up computing and communication. - # Find more detail here: https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/ascendcbestP/atlas_ascendc_best_practices_10_0043.html - "VLLM_ENABLE_MC2": - lambda: bool(int(os.getenv("VLLM_ENABLE_MC2", '0'))), # Whether to enable the topk optimization. It's disabled by default for experimental support # We'll make it enabled by default in the future. "VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE", '0'))), - # Whether to use LCCL communication. If not set, the default value is False. - "USING_LCCL_COM": - lambda: bool(int(os.getenv("USING_LCCL_COM", '0'))), # The version of the Ascend chip. If not set, the default value is # ASCEND910B1. It's used for package building. 
Please make sure that the # version is correct. diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py index 9db49cbff6..6ab0837e37 100644 --- a/vllm_ascend/models/deepseek_dbo.py +++ b/vllm_ascend/models/deepseek_dbo.py @@ -51,9 +51,9 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.models.deepseek_v2 import \ - DeepseekV2ForCausalLM # ruff: noqa: E501 + DeepseekV2ForCausalLM # noqa: E501 from vllm.model_executor.models.deepseek_v2 import \ - yarn_get_mscale # ruff: noqa: E501 + yarn_get_mscale # noqa: E501 from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Attention, DeepseekV2DecoderLayer, DeepseekV2MLAAttention) @@ -79,7 +79,6 @@ from vllm_ascend.utils import dispose_tensor VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO -VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 class CustomDeepseekDBOMLP(CustomDeepseekV2MLP): @@ -189,26 +188,8 @@ def forward( if hasattr(attn_metadata, 'with_prefill_across_dp'): is_prefill = is_prefill or attn_metadata.with_prefill_across_dp - num_tokens, hidden_size = hidden_states.shape - old_hidden_states = hidden_states.clone() - if self.tp_size > 1: - if envs_ascend.VLLM_ENABLE_MC2 and not is_prefill: - chunks = torch.chunk(hidden_states, self.tp_size, dim=0) - hidden_states = chunks[self.tp_rank] - elif not self.torchair_graph_enabled: - num_padding_tokens = (self.tp_size - - num_tokens % self.tp_size) % self.tp_size - # Pad hidden_states to make it divisible by tp_size to avoid cross-ring AllGatherV on 910B2C - if num_padding_tokens > 0: - hidden_states = nn.functional.pad( - hidden_states, (0, 0, 0, num_padding_tokens)) - chunk_hidden_states = torch.tensor_split(hidden_states, - self.tp_size, - dim=0) - hidden_states = chunk_hidden_states[self.tp_rank] - # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) @@ -220,33 +201,13 @@ def forward( enable_force_load_balance=enable_force_load_balance, ) * self.routed_scaling_factor - if self.tp_size > 1: - if self.torchair_graph_enabled: - if envs_ascend.VLLM_ENABLE_MC2 and not is_prefill: - final_hidden_states = torch.zeros( - [num_tokens, hidden_size], - dtype=self.params_dtype, - device="npu") - dist.all_gather_into_tensor(final_hidden_states, - hidden_states, self.tp_group) - hidden_states = final_hidden_states - else: - hidden_states = tensor_model_parallel_all_reduce( - hidden_states) - else: - dist.all_gather(list(chunk_hidden_states), hidden_states, - self.tp_group) - hidden_states = torch.cat(chunk_hidden_states, dim=0) - if num_padding_tokens > 0: - hidden_states = hidden_states[:-num_padding_tokens] - if self.n_shared_experts is not None: shared_output = self.shared_experts(old_hidden_states) if shared_output is not None: hidden_states = hidden_states + shared_output - return hidden_states.view(num_tokens, hidden_size) + return hidden_states # ----------------------------------------- TBO-related -------------------------------------------- def _forward_ms_op_shared_expert( diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 0ae1142a3d..e96b2e9847 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -28,7 +28,6 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch -import torch.distributed as dist import torch_npu import vllm.envs as envs from torch import nn @@ -37,7 +36,7 @@ from vllm.config import CacheConfig, 
ModelConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, - get_tp_group, tensor_model_parallel_all_reduce) + get_tp_group) from vllm.distributed.parallel_state import get_dp_group from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import SiluAndMul @@ -54,9 +53,9 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.models.deepseek_v2 import \ - DeepseekV2ForCausalLM # ruff: noqa: E501 + DeepseekV2ForCausalLM # noqa: E501 from vllm.model_executor.models.deepseek_v2 import \ - yarn_get_mscale # ruff: noqa: E501 + yarn_get_mscale # noqa: E501 from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Attention, DeepseekV2DecoderLayer, DeepseekV2MLAAttention) @@ -65,7 +64,6 @@ maybe_prefix) from vllm.sequence import IntermediateTensors -import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import AscendFusedMoE @@ -74,8 +72,6 @@ from vllm_ascend.utils import (dispose_tensor, npu_stream_switch, npu_wait_tensor) -VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 - class CustomDeepseekV2SiluAndMul(SiluAndMul): @@ -240,9 +236,8 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on self.enable_multistream_moe = \ - ascend_config.torchair_graph_config.enable_multistream_moe and VLLM_ENABLE_MC2 + ascend_config.torchair_graph_config.enable_multistream_moe self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, @@ -312,22 +307,6 @@ def forward( enable_force_load_balance = False if hasattr(attn_metadata, 'with_prefill_across_dp'): is_prefill = is_prefill or attn_metadata.with_prefill_across_dp - num_tokens, hidden_size = hidden_states.shape - old_hidden_states = hidden_states - use_separated_shared_experts = (self.shared_experts is not None - and not self.enable_multistream_moe) - - if self.tp_size > 1: - if (VLLM_ENABLE_MC2 - and not is_prefill) or not (self.torchair_graph_enabled or - self.ep_group.world_size == 1): - if num_tokens < self.tp_size: - hidden_states = nn.functional.pad( - hidden_states, (0, 0, 0, self.tp_size - num_tokens)) - chunk_hidden_states = torch.tensor_split(hidden_states, - self.tp_size, - dim=0) - hidden_states = chunk_hidden_states[self.tp_rank] # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) @@ -338,34 +317,14 @@ def forward( is_prefill=is_prefill, top_k=CustomDeepseekV2MoE.top_k, enable_force_load_balance=enable_force_load_balance, - shared_experts=(self.shared_experts - if not use_separated_shared_experts else None), + shared_experts=self.shared_experts, ) - if not isinstance(experts_hidden_states, tuple): - hidden_states = experts_hidden_states * self.routed_scaling_factor - else: - hidden_states = ( - experts_hidden_states[0] * self.routed_scaling_factor + - experts_hidden_states[1]) - - if self.tp_size > 1: - if (VLLM_ENABLE_MC2 - and not is_prefill) or not (self.torchair_graph_enabled or - self.ep_group.world_size == 1): - dist.all_gather(list(chunk_hidden_states), hidden_states, - self.tp_group) - hidden_states = torch.cat(chunk_hidden_states, dim=0) - if num_tokens < self.tp_size: - hidden_states = hidden_states[:num_tokens] - else: - hidden_states = 
tensor_model_parallel_all_reduce(hidden_states) + hidden_states = ( + experts_hidden_states[0] * self.routed_scaling_factor + + experts_hidden_states[1]) - if use_separated_shared_experts: - hidden_states = hidden_states + self.shared_experts( - old_hidden_states) - - return hidden_states.view(num_tokens, hidden_size) + return hidden_states class CustomDeepseekV2MLAAttention(DeepseekV2MLAAttention): diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index d6115d35c1..4a4b4887f5 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -21,11 +21,13 @@ import torch import torch.distributed as dist import torch_npu +from torch import nn from vllm.config import get_current_vllm_config -from vllm.distributed import (GroupCoordinator, +from vllm.distributed import (GroupCoordinator, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) -from vllm.distributed.parallel_state import get_dp_group +from vllm.distributed.parallel_state import get_dp_group, get_tp_group +from vllm.forward_context import get_forward_context from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEParallelConfig, MoEConfig, UnquantizedFusedMoEMethod, determine_expert_map) @@ -36,10 +38,10 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer -from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor +from vllm_ascend.utils import (FusedMoEState, dispose_tensor, + get_fused_moe_state, npu_stream_switch, + npu_wait_tensor) -VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 -USING_LCCL_COM: bool = envs_ascend.USING_LCCL_COM MOE_ALL2ALL_BUFFER: bool = envs_ascend.MOE_ALL2ALL_BUFFER @@ -845,8 +847,8 @@ def __init__(self, moe: MoEConfig = None): super().__init__(moe=moe) vllm_config = get_current_vllm_config() - ep_group = get_ep_group() - self.ep_size = ep_group.world_size + self.ep_group = get_ep_group() + self.ep_size = self.ep_group.world_size self.global_batch_size = vllm_config.scheduler_config.max_num_seqs self.local_batch_size = self.global_batch_size // self.ep_size self.max_model_len = vllm_config.model_config.max_model_len @@ -855,7 +857,7 @@ def __init__(self, moe: MoEConfig = None): self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled try: - device_group = ep_group.device_group + device_group = self.ep_group.device_group # TODO: Try local_rank = ep_group.rank_in_group local_rank = torch.distributed.get_rank(group=device_group) backend = device_group._get_backend(torch.device("npu")) @@ -931,7 +933,9 @@ def apply( if enable_force_load_balance: topk_ids = torch.randint_like(topk_ids, 0, global_num_experts) - if VLLM_ENABLE_MC2 and not is_prefill: + fused_moe_state = get_fused_moe_state(self.ep_group.world_size, + is_prefill) + if fused_moe_state == FusedMoEState.MC2: return fused_experts_with_mc2( hidden_states=x, w1=layer.w13_weight, @@ -942,7 +946,7 @@ def apply( expert_map=expert_map, moe_all_to_all_group_name=self.moe_all_to_all_group_name, shared_experts=shared_experts) - elif self.torchair_graph_enabled or get_ep_group().world_size == 1: + elif fused_moe_state == FusedMoEState.AllGather: return fused_experts(hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, @@ -1022,9 +1026,6 @@ def __init__( get_dp_group().world_size), vllm_parallel_config=vllm_config.parallel_config)) - self.moe_parallel_config.ep_size = 
get_ep_group().world_size - self.moe_parallel_config.tp_size = get_etp_group().world_size - self.top_k = top_k self.num_experts = num_experts self.global_num_experts = num_experts @@ -1066,10 +1067,9 @@ def __init__( self.ep_size, get_ep_group().rank_in_group, self.global_num_experts) - self.moe_parallel_config.tp_rank = get_etp_group().rank_in_group - self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group - self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_multistream_moe = \ + ascend_config.torchair_graph_config.enable_multistream_moe if self.scoring_func != "softmax" and not self.use_grouped_topk: raise ValueError("Only softmax scoring function is supported for " @@ -1109,6 +1109,8 @@ def __init__( moe_quant_params["intermediate_size_full"] = intermediate_size self.ep_group = get_ep_group() + # NOTE: self.tp_group is not expert_tp_group + self.tp_group = get_tp_group().device_group self.quant_method.create_weights(layer=self, **moe_quant_params) def forward(self, @@ -1125,25 +1127,45 @@ def forward(self, else: real_top_k = self.top_k - # MC2 ag/rs broadcast/all_reduce - # prefill_req x x √ - # decode_req √ x √ - # graph_mode √ √ x - if self.dp_size > 1: - if VLLM_ENABLE_MC2 and not is_prefill: - ... - elif self.torchair_graph_enabled: - if USING_LCCL_COM: # type: ignore - hidden_states = get_dp_group().all_gather( - hidden_states, 0, False) - router_logits = get_dp_group().all_gather( - router_logits, 0, False) - elif self.torchair_graph_enabled and not is_prefill: - hidden_states = get_dp_group().all_gather(hidden_states, 0) - router_logits = get_dp_group().all_gather(router_logits, 0) - else: - hidden_states, router_logits = get_ep_group().dispatch( - hidden_states, router_logits) + num_tokens, hidden_size = hidden_states.shape + + fused_moe_state = get_fused_moe_state(self.moe_parallel_config.ep_size, + is_prefill) + if shared_experts: + if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2: + shared_hidden_states = shared_experts(hidden_states) + + tp_size = get_tensor_model_parallel_world_size() + if tp_size > 1 and fused_moe_state != FusedMoEState.AllGather: + if num_tokens < tp_size: + hidden_states = nn.functional.pad( + hidden_states, (0, 0, 0, tp_size - num_tokens)) + router_logits = nn.functional.pad( + router_logits, (0, 0, 0, tp_size - num_tokens)) + chunk_hidden_states = torch.tensor_split(hidden_states, + tp_size, + dim=0) + chunk_router_logits = torch.tensor_split(router_logits, + tp_size, + dim=0) + tp_rank = get_tensor_model_parallel_rank() + hidden_states = chunk_hidden_states[tp_rank] + router_logits = chunk_router_logits[tp_rank] + if self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather: + # NOTE: When in torchair graph, it has been padded in model_runner_v1 + if not self.torchair_graph_enabled or is_prefill: + attn_metadata = get_forward_context().attn_metadata + if attn_metadata is not None: + max_num_tokens_across_dp = attn_metadata.max_num_tokens_across_dp + if num_tokens < max_num_tokens_across_dp: + hidden_states = nn.functional.pad( + hidden_states, + (0, 0, 0, max_num_tokens_across_dp - num_tokens)) + router_logits = nn.functional.pad( + router_logits, + (0, 0, 0, max_num_tokens_across_dp - num_tokens)) + hidden_states = get_dp_group().all_gather(hidden_states, 0) + router_logits = get_dp_group().all_gather(router_logits, 0) # Matrix multiply. 
e_hidden_states = self.quant_method.apply( @@ -1167,36 +1189,36 @@ def forward(self, shared_experts=shared_experts, ) - if shared_experts is not None: - # Provide dummy implementation of "non-separated" shared experts. - if not isinstance(e_hidden_states, tuple): - return e_hidden_states, shared_experts(hidden_states) - else: - return e_hidden_states - - if self.dp_size > 1: - if VLLM_ENABLE_MC2 and not is_prefill: - ... - elif self.torchair_graph_enabled: - if USING_LCCL_COM: # type: ignore - e_hidden_states = dist._functional_collectives.reduce_scatter_tensor( - e_hidden_states, - "sum", - scatter_dim=0, - group=get_dp_group().device_group) - elif self.torchair_graph_enabled and not is_prefill: - e_hidden_states = dist._functional_collectives.reduce_scatter_tensor( - e_hidden_states, - "sum", - scatter_dim=0, - group=get_dp_group().device_group) - else: - e_hidden_states = get_ep_group().combine(e_hidden_states) - - if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): - e_hidden_states = tensor_model_parallel_all_reduce(e_hidden_states) - - return e_hidden_states + if shared_experts: + if isinstance(e_hidden_states, tuple): + e_hidden_states, shared_hidden_states = e_hidden_states + + if tp_size > 1 and fused_moe_state != FusedMoEState.AllGather: + dist.all_gather(list(chunk_hidden_states), e_hidden_states, + self.tp_group) + final_hidden_states = torch.cat(chunk_hidden_states, dim=0) + if num_tokens < tp_size: + final_hidden_states = final_hidden_states[:num_tokens] + dispose_tensor(e_hidden_states) + elif self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather: + final_hidden_states = dist._functional_collectives.reduce_scatter_tensor( + e_hidden_states, + "sum", + scatter_dim=0, + group=get_dp_group().device_group) + final_hidden_states = final_hidden_states[:num_tokens] + dispose_tensor(e_hidden_states) + else: + final_hidden_states = e_hidden_states + + if tp_size > 1 and fused_moe_state == FusedMoEState.AllGather: + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + if shared_experts: + return final_hidden_states, shared_hidden_states + else: + return final_hidden_states # ----------------------------------------- TBO-related -------------------------------------------- diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index 66a0a302c8..6c44a6a8bd 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -22,15 +22,13 @@ import torch_npu from vllm.distributed import GroupCoordinator -import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import select_experts -from vllm_ascend.utils import (dispose_tensor, npu_stream_switch, +from vllm_ascend.utils import (FusedMoEState, dispose_tensor, + get_fused_moe_state, npu_stream_switch, npu_wait_tensor) -VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 - def apply_mlp(hidden_states: torch.Tensor, w1: torch.Tensor, @@ -660,7 +658,9 @@ def apply( topk_weights = topk_weights.to(x.dtype) - if VLLM_ENABLE_MC2 and not is_prefill: + fused_moe_state = get_fused_moe_state(self.ep_group.world_size, + is_prefill) + if fused_moe_state == FusedMoEState.MC2: return fused_experts_with_mc2( hidden_states=x, w1=layer.w13_weight, @@ -675,7 +675,7 @@ def apply( log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, shared_experts=shared_experts) - elif 
self.torchair_graph_enabled or self.ep_group.world_size == 1: + elif fused_moe_state == FusedMoEState.AllGather: return fused_experts(hidden_states=x, w1=layer.w13_weight, w1_scale=layer.w13_weight_scale, diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index d932053520..eeab287906 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -20,6 +20,7 @@ import atexit import math from contextlib import contextmanager, nullcontext +from enum import Enum from threading import Lock from typing import TYPE_CHECKING, List, Tuple @@ -275,3 +276,21 @@ def npu_wait_tensor(self: torch.Tensor, *, enabled: bool = True): return _npu_wait_tensor(self, dependency) if enabled else self + + +# TODO(zzzzwwjj): move this into forward_context +class FusedMoEState(Enum): + AllGather = 0 + All2All = 1 + MC2 = 2 + + +# TODO(zzzzwwjj): add soc_version to choose branch +def get_fused_moe_state(ep_size: int, with_prefill: bool): + if ep_size == 1: + return FusedMoEState.AllGather + # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph. + elif ep_size < 16 or with_prefill: + return FusedMoEState.All2All + else: + return FusedMoEState.MC2 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 6d226da0bf..801819b039 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -348,15 +348,10 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.init_torchair_graph_batch_sizes() if len(self.torchair_graph_batch_sizes) == 0: - #If MC2 is enabled, torchair_graph_batch_size should pad to tp_size - if envs_ascend.VLLM_ENABLE_MC2: - self.torchair_graph_batch_sizes = [ - self.scheduler_config.max_num_seqs - ] - else: - self.torchair_graph_batch_sizes = [ - 1, self.scheduler_config.max_num_seqs - ] + # TODO(zzzzwwjj): check torchair_graph_batch_sizes init code + self.torchair_graph_batch_sizes = [ + self.scheduler_config.max_num_seqs + ] torch._dynamo.cache_size.config.cache_size_limit += len( self.torchair_graph_batch_sizes) @@ -569,10 +564,12 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: self.input_batch.refresh_sampling_metadata() def _get_forward_metadata_across_dp( - self, batch_size: int, with_prefill: bool) -> tuple[int, bool]: - forward_metadata = torch.tensor([batch_size, with_prefill], - device="cpu", - dtype=torch.int32) + self, total_num_scheduled_tokens: int, + with_prefill: bool) -> tuple[int, bool]: + forward_metadata = torch.tensor( + [total_num_scheduled_tokens, with_prefill], + device="cpu", + dtype=torch.int32) dist.all_reduce(forward_metadata, op=ReduceOp.MAX, group=get_dp_group().cpu_group) @@ -901,11 +898,11 @@ def _process_reqs( if self.dp_size > 1: max_num_tokens, with_prefill = self._get_forward_metadata_across_dp( total_num_scheduled_tokens, with_prefill) + extra_builder_kwargs['max_num_tokens_across_dp'] = max_num_tokens extra_builder_kwargs['with_prefill_across_dp'] = with_prefill # Add graph_pad_size here - if envs_ascend.VLLM_ENABLE_MC2 or (self.torchair_graph_enabled - and not with_prefill): + if self.torchair_graph_enabled and not with_prefill: if self.dp_size > 1: padded_batch_size = self.select_torchair_padded_batch_size( max_num_tokens) @@ -984,8 +981,7 @@ def _process_reqs( else: positions = self.positions[:num_input_tokens] - if (envs_ascend.VLLM_ENABLE_MC2 - or self.torchair_graph_enabled) and not with_prefill: + if self.torchair_graph_enabled and not with_prefill: input_ids = self.input_ids[:padded_batch_size] positions = 
self.positions[:padded_batch_size] @@ -1885,20 +1881,15 @@ def _generate_mtp_token_ids( return spec_token_ids def init_torchair_graph_batch_sizes(self): + start_graph_batch_size = 4 tp_size = get_tensor_model_parallel_world_size() - batch_size_step = 8 - largest_batch_size = 1 - - if envs_ascend.VLLM_ENABLE_MC2: - batch_size_step = max(batch_size_step, tp_size) - largest_batch_size = batch_size_step - while (largest_batch_size < 8): - self.torchair_graph_batch_sizes.append(largest_batch_size) - largest_batch_size *= 2 - - while (largest_batch_size <= self.scheduler_config.max_num_seqs): - self.torchair_graph_batch_sizes.append(largest_batch_size) - largest_batch_size += batch_size_step + + # NOTE: When use all2all | mc2, We need to slice the `num_tokens` dimension into `tp_size` blocks + start_graph_batch_size = max(start_graph_batch_size, tp_size) + + while (start_graph_batch_size <= self.scheduler_config.max_num_seqs): + self.torchair_graph_batch_sizes.append(start_graph_batch_size) + start_graph_batch_size *= 2 def select_torchair_padded_batch_size(self, batch_size: int): selected_batch_size = self.max_num_reqs diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 76844c96a4..6fe84a4580 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -38,7 +38,6 @@ from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.worker_base import WorkerBase -import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import init_ascend_config from vllm_ascend.device_allocator.camem import CaMemAllocator from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel @@ -247,15 +246,15 @@ def pin_lora(self, lora_id: int) -> bool: def execute_dummy_batch(self) -> None: runner = self.model_runner - num_tokens = 1 + max_num_tokens = 1 + with_prefill = False if runner.dp_size > 1: max_num_tokens, with_prefill = runner._get_forward_metadata_across_dp( - 1, False) - if envs_ascend.VLLM_ENABLE_MC2 or runner.torchair_graph_enabled: - if not with_prefill: - num_tokens = max_num_tokens - num_tokens = runner.select_torchair_padded_batch_size(num_tokens) - runner._dummy_run(num_tokens, + max_num_tokens, with_prefill) + if runner.torchair_graph_enabled and not with_prefill: + max_num_tokens = runner.select_torchair_padded_batch_size( + max_num_tokens) + runner._dummy_run(max_num_tokens, is_compile=False, with_prefill=with_prefill) From 7f77443ab84740ddd67210b68f9aa6123ac1417d Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Mon, 16 Jun 2025 20:48:56 +0800 Subject: [PATCH 079/153] fix bugsw --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index b9d798fbe6..65c3486bcf 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -102,4 +102,3 @@ def do_update_log2phy_map(self, layer_id, updated_log2phy_map): rank_id = torch.distributed.get_rank() if self.log2phy_map_per_layer[layer_id] is not None: self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[rank_id]) - From 0b8d00aa694bf6ea7122202f8a4634849e7350f8 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Mon, 16 Jun 2025 22:09:22 +0800 Subject: [PATCH 080/153] improve the implement of communication between main process and eplb process --- vllm_ascend/eplb/eplb_updator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py 
b/vllm_ascend/eplb/eplb_updator.py index 827ab3ead2..8d5fe835e7 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -96,7 +96,7 @@ def forward_before(self): self.update_info_all = self.block_update_queue.get() if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: - (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(0) + (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.update_info_all.pop(0) rank_id = torch.distributed.get_rank() self.eplb_loader.set_log2phy_map(log2phy_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] From 447360f5375f7cd93b211e4c86a4fad2e9295d36 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Tue, 17 Jun 2025 19:04:26 +0800 Subject: [PATCH 081/153] avoid frequetly synchronize between device and cpu when accessing to expert map --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 19 ++++++++++++------- .../core/loader/device_transfer_loader.py | 10 +++++----- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index d42482a37a..50793ebede 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -29,6 +29,7 @@ class VllmEplbAdaptor(EplbAdaptor): def __init__(self, model, **args): super().__init__(**args) self.model = model + self.rank_id = torch.distributed.get_rank() self.param_dict = dict(self.model.named_parameters()) self.num_dense_layers = self.model.config.first_k_dense_replace self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers @@ -43,7 +44,8 @@ def __init__(self, model, **args): self.expert_map_per_layer[self.num_dense_layers + layer_idx] =\ self.model.get_expert_map(self.num_dense_layers + layer_idx) - self.buffer_tensor_dict = dict() + self.buffer_tensor_dict = dict() # reference to expert map on device for expert map update + self.buffer_tensor_dict_cpu = dict() # copy of expert map on CPU to avoid device synchronize frequently # TODO: here we set number of buffer tensor equal to number of expert in each laryer, which can be improved num_buffer_tensor = torch.where(self.expert_map_per_layer[self.num_dense_layers] != -1)[0].numel() self.init_buffer_tensor_dict(num_buffer_tensor) @@ -66,7 +68,7 @@ def get_buffer_tensor(self, buffer_tensor_id): def get_expert_tensor(self, layer_id, global_expert_id_to_send): for name in self.expert_weight_names: complete_name = "model.layers." + str(layer_id) + ".mlp.experts." 
+ name - local_expert_id = self.expert_map_per_layer[layer_id][global_expert_id_to_send].item() + local_expert_id = self.expert_map_per_layer_cpu[layer_id][global_expert_id_to_send].item() yield self.param_dict[complete_name].data[local_expert_id] def get_rank_expert_workload(self, num_moe_layers): @@ -86,16 +88,20 @@ def get_init_expert_map(self, num_moe_layers): all_maps = gathered.permute(1, 0, 2) all_expert_maps = all_maps.cpu() + for layer_idx in range(num_moe_layers): + self.log2phy_map_per_layer_cpu[self.num_dense_layers + layer_idx] = \ + all_expert_maps[layer_idx][self.rank_id] + return all_expert_maps def do_update_expert_map(self, layer_id, updated_expert_map): self.expert_map_per_layer[layer_id].copy_(updated_expert_map) + self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map) - def do_update_expert_weight(self, layer_id, expert_id_before_replace, buffer_tensor_id): + def do_update_expert_weight(self, layer_id, local_expert_to_replace, buffer_tensor_id): for name in self.expert_weight_names: complete_name = "model.layers." + str(layer_id) + ".mlp.experts." + name - local_expert_id = self.expert_map_per_layer[layer_id][expert_id_before_replace].item() - expert_tensor = self.param_dict[complete_name].data[local_expert_id] + expert_tensor = self.param_dict[complete_name].data[local_expert_to_replace] expert_tensor.copy_(self.buffer_tensor_dict[name][buffer_tensor_id]) def generate_index_dicts(self, tensor_2d): @@ -140,9 +146,8 @@ def generate_log2phy_map(self, expert_map): return log2phy_map def do_update_log2phy_map(self, layer_id, updated_log2phy_map): - rank_id = torch.distributed.get_rank() if self.log2phy_map_per_layer[layer_id] is not None: - self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[rank_id]) + self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[self.rank_id]) def global2local(self, placement: torch.Tensor, diff --git a/vllm_ascend/eplb/core/loader/device_transfer_loader.py b/vllm_ascend/eplb/core/loader/device_transfer_loader.py index d7afb47e5f..0c8e963a33 100644 --- a/vllm_ascend/eplb/core/loader/device_transfer_loader.py +++ b/vllm_ascend/eplb/core/loader/device_transfer_loader.py @@ -64,7 +64,8 @@ def generate_expert_d2d_transfer_task(self, expert_send_info, expert_recv_info, recv_rank, global_expert_id_to_recv = recv_info for buffer_tensor in self.eplb_adaptor.get_buffer_tensor(buffer_tensor_id): self.comm_op_list.append(dist.P2POp(dist.irecv, buffer_tensor, recv_rank)) - self.recv_expert_list.append((global_expert_id_to_recv, buffer_tensor_id)) + local_expert_to_replace = self.updated_expert_map[global_expert_id_to_recv].item() + self.recv_expert_list.append((local_expert_to_replace, buffer_tensor_id)) buffer_tensor_id += 1 self.state = ExpertWeightUpdateState.READY @@ -104,9 +105,9 @@ def update_expert_map_and_weight(self, reqs): # update expert weight buffer_tensor_id = 0 for recv_expert_info in self.recv_expert_list: - global_expert_id_to_recv, buffer_tensor_id = recv_expert_info - self.eplb_adaptor.do_update_expert_weight(self.layer_id, global_expert_id_to_recv, buffer_tensor_id) - + local_expert_to_replace, buffer_tensor_id = recv_expert_info + self.eplb_adaptor.do_update_expert_weight(self.layer_id, local_expert_to_replace, buffer_tensor_id) + logger.info(f"[EPLB] finished update expert weight for layer: {self.layer_id}") self.recv_expert_list = [] @@ -149,4 +150,3 @@ def generate_mock_update_info(self, rank_id): def load_impl(self, old_expert_table, new_expert_table): raise NotImplementedError - \ No newline at end of 
file From 40ee72fb2fd72a05e313874f8b9e93c09f7ae0b1 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 17 Jun 2025 19:23:02 +0800 Subject: [PATCH 082/153] add gate for cacluate moe load --- vllm_ascend/ops/fused_moe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 1a02b7a144..84bcc9dc86 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1054,6 +1054,7 @@ def __init__( ascend_config = get_ascend_config() expert_map_path = ascend_config.expert_map_path + self.dynamic_eplb = ascend_config.dynamic_eplb if expert_map_path and os.path.exists(expert_map_path): # moe expert load balance expert_load_balancer = ExpertLoadBalancer(expert_map_path, @@ -1181,7 +1182,9 @@ def forward(self, else: return e_hidden_states - self.calculate_moe_load() + #EPLB calculate moe load + if self.dynamic_eplb == True: + self.calculate_moe_load() # if self.enable_multistream_shared_expert and not is_prefill: # hidden_states, shared_output = hidden_states From 4380fddf72db84c1ca7014405195ba8e3c628258 Mon Sep 17 00:00:00 2001 From: raindaywhu Date: Tue, 17 Jun 2025 19:30:17 +0800 Subject: [PATCH 083/153] fix log2phy --- vllm_ascend/eplb/eplb_updator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 8d5fe835e7..1a0e0ac418 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -98,7 +98,6 @@ def forward_before(self): if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.update_info_all.pop(0) rank_id = torch.distributed.get_rank() - self.eplb_loader.set_log2phy_map(log2phy_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] #logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") From 49efd9b86cc0b25c61a52483cd45bbff47496988 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Tue, 17 Jun 2025 20:27:28 +0800 Subject: [PATCH 084/153] fix bugs in expert_map_per_layer_cpu --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 50793ebede..0df2dfb6d7 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -39,13 +39,13 @@ def __init__(self, model, **args): self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", "w2_weight_scale", "w2_weight_offset"] - self.expert_map_per_layer = dict() + self.expert_map_per_layer = dict() # reference to expert map on device for expert map update + self.expert_map_per_layer_cpu = dict() # copy of expert map on CPU to avoid device synchronize frequently for layer_idx in range(self.num_moe_layers): self.expert_map_per_layer[self.num_dense_layers + layer_idx] =\ self.model.get_expert_map(self.num_dense_layers + layer_idx) - self.buffer_tensor_dict = dict() # reference to expert map on device for expert map update - self.buffer_tensor_dict_cpu = dict() # copy of expert map on CPU to avoid device synchronize frequently + self.buffer_tensor_dict = dict() # TODO: here we set number of buffer tensor equal to number of expert in each 
laryer, which can be improved num_buffer_tensor = torch.where(self.expert_map_per_layer[self.num_dense_layers] != -1)[0].numel() self.init_buffer_tensor_dict(num_buffer_tensor) @@ -89,7 +89,7 @@ def get_init_expert_map(self, num_moe_layers): all_expert_maps = all_maps.cpu() for layer_idx in range(num_moe_layers): - self.log2phy_map_per_layer_cpu[self.num_dense_layers + layer_idx] = \ + self.expert_map_per_layer_cpu[self.num_dense_layers + layer_idx] = \ all_expert_maps[layer_idx][self.rank_id] return all_expert_maps From d18aef1a743e148a2370b18c4ed3f6e9c7ff7c6e Mon Sep 17 00:00:00 2001 From: raindaywhu Date: Tue, 17 Jun 2025 20:31:02 +0800 Subject: [PATCH 085/153] fix log2phy --- vllm_ascend/eplb/tool/eplb_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/tool/eplb_utils.py b/vllm_ascend/eplb/tool/eplb_utils.py index ff048eb4ac..7bf4184d7e 100644 --- a/vllm_ascend/eplb/tool/eplb_utils.py +++ b/vllm_ascend/eplb/tool/eplb_utils.py @@ -16,6 +16,7 @@ # import torch +import random class ExpertMapUtils(): @@ -48,7 +49,7 @@ def generate_log2phy_map(cls, expert_map): result_dict[key] = [] result_dict[key].append(idx) - log2phy_map = torch.full((ranks_num, self.global_expert_num), + log2phy_map = torch.full((ranks_num, global_expert_num), -1, dtype=torch.int32) for rank in range(ranks_num): From 53d02185327d4fb9badcc34a4c6b955e41f9fdad Mon Sep 17 00:00:00 2001 From: NPUDev Date: Tue, 17 Jun 2025 21:03:09 +0800 Subject: [PATCH 086/153] fix log2phy --- vllm_ascend/eplb/tool/eplb_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/eplb/tool/eplb_utils.py b/vllm_ascend/eplb/tool/eplb_utils.py index 7bf4184d7e..e36a97b45a 100644 --- a/vllm_ascend/eplb/tool/eplb_utils.py +++ b/vllm_ascend/eplb/tool/eplb_utils.py @@ -38,10 +38,10 @@ def generate_index_dicts(cls, tensor_2d): @classmethod def generate_log2phy_map(cls, expert_map): num_local_experts = expert_map.max() + 1 - expert_map = cls.global2local(expert_map, num_local_experts) ranks_num, global_expert_num = expert_map.shape - concatenated = torch.flatten(expert_map) - rank_expert_to_global = cls.generate_index_dicts(expert_map) + local_expert_map = cls.global2local(expert_map, num_local_experts) + concatenated = torch.flatten(local_expert_map) + rank_expert_to_global = cls.generate_index_dicts(local_expert_map) result_dict: Dict[int, List[int]] = {} for idx, value in enumerate(concatenated): key = value.item() From af10b4a0552456b4ea560eed67d26419c1235138 Mon Sep 17 00:00:00 2001 From: raindaywhu Date: Tue, 17 Jun 2025 21:13:10 +0800 Subject: [PATCH 087/153] mv log2phy into eplb worker --- vllm_ascend/eplb/eplb_updator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 1a0e0ac418..827ab3ead2 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -96,8 +96,9 @@ def forward_before(self): self.update_info_all = self.block_update_queue.get() if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: - (expert_send_info, expert_recv_info, updated_expert_map, layer_id) = self.update_info_all.pop(0) + (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(0) rank_id = torch.distributed.get_rank() + self.eplb_loader.set_log2phy_map(log2phy_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] expert_recv_info_this_rank = 
expert_recv_info[rank_id] if rank_id in expert_recv_info else [] #logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") From f8029945c32cc53ae5b6f6479f8a5ab4f731cb69 Mon Sep 17 00:00:00 2001 From: Li Wang Date: Tue, 17 Jun 2025 21:25:40 +0800 Subject: [PATCH 088/153] [Bugfix] Remove cuda related lines and add additional pip mirror (#1252) ### What this PR does / why we need it? - For npu environment, we should use `PYTORCH_NPU_ALLOC_CONF ` rather than `PYTORCH_CUDA_ALLOC_CONF` - Add `PIP_EXTRA_INDEX_URL` to make nightly_benchmarks happy --------- Signed-off-by: wangli --- .github/workflows/nightly_benchmarks.yaml | 2 ++ vllm_ascend/device_allocator/camem.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 6e03c1e6be..2b9c062957 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -167,6 +167,8 @@ jobs: cp -r benchmarks/* /github/home/benchmarks/ - name: Run benchmark iteration + env: + PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi if: github.event_name != 'pull_request' run: | while IFS= read -r line || [[ -n "$line" ]]; do diff --git a/vllm_ascend/device_allocator/camem.py b/vllm_ascend/device_allocator/camem.py index 7592a04365..7156602ecd 100644 --- a/vllm_ascend/device_allocator/camem.py +++ b/vllm_ascend/device_allocator/camem.py @@ -144,7 +144,7 @@ def get_instance() -> "CaMemAllocator": return CaMemAllocator.instance def __init__(self): - conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "") + conf = os.environ.get("PYTORCH_NPU_ALLOC_CONF", "") assert "expandable_segments:True" not in conf, \ ("Expandable segments are not compatible with memory pool. " "Please track https://github.com/pytorch/pytorch/issues/147851 " From afc8edb0460a14885ba780bf00e26afe1a9b4598 Mon Sep 17 00:00:00 2001 From: Jade Zheng Date: Tue, 17 Jun 2025 22:16:44 +0800 Subject: [PATCH 089/153] [Bugfix]: Pass scaling args to mc2 (#1202) Pass `expert_scale` and `expand_scale` args to the dispatch and combine functions. 
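
As a rough illustration of the intended data flow (a minimal sketch only: the helper name below is made up for this note, and the keyword names accepted by the `torch_npu` MC2 ops are assumed from the diff that follows):

```python
import torch

def mc2_scale_kwargs(topk_weights: torch.Tensor, dispatch_output: tuple):
    # Dispatch receives the per-token expert weights as float32 scales.
    dispatch_extra = {"expert_scales": topk_weights.to(torch.float32)}
    # Dispatch now returns seven values; the last one (expand_scales)
    # has to be forwarded to the combine step.
    expand_scales = dispatch_output[6]
    combine_extra = {"expand_scales": expand_scales}
    return dispatch_extra, combine_extra
```
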
Signed-off-by: Jade Zheng --- vllm_ascend/quantization/w8a8_dynamic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index 6c44a6a8bd..aeadc7b2cd 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -130,6 +130,7 @@ def fused_experts_with_mc2( "shared_expert_rank_num": 0, "moe_expert_num": moe_expert_num, "global_bs": global_bs, + "expert_scales": topk_weights.to(torch.float32), } rank = torch.distributed.get_rank() @@ -158,8 +159,8 @@ def fused_experts_with_mc2( output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2) # comm_stream.wait_stream(torch.npu.current_stream()) - expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[ - 0:5] + expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts, _, expand_scales = output[ + 0:7] if shared_experts is not None: with npu_stream_switch("moe_secondary", 0): @@ -187,6 +188,7 @@ def fused_experts_with_mc2( "shared_expert_rank_num": 0, "moe_expert_num": moe_expert_num, "global_bs": 0, + "expand_scales": expand_scales, } tp_recv_counts = torch.empty(1, dtype=torch.int32, From d7e19ed57a8b666855720d68185e140094890167 Mon Sep 17 00:00:00 2001 From: whx <56632993+whx-sjtu@users.noreply.github.com> Date: Tue, 17 Jun 2025 23:14:25 +0800 Subject: [PATCH 090/153] [BugFix] fix length of sin/cos cache in rope (#1266) This PR fixes the bug that constructs shorter sin/cos cache than model's max positional embedding. Closes: https://github.com/vllm-project/vllm-ascend/issues/1038 Signed-off-by: whx-sjtu <2952154980@qq.com> --- vllm_ascend/ops/rotary_embedding.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 9f8ae784c5..39a4c1cfe8 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -219,7 +219,9 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask self.register_buffer("inv_freq", inv_freq, persistent=False) - t = torch.arange(seq_len, device=device, dtype=torch.float32) + t = torch.arange(seq_len * self.scaling_factor, + device=device, + dtype=torch.float32) freqs = torch.outer(t, inv_freq) cos_cached = torch.cat([freqs, freqs], dim=-1).cos() * self.mscale From 1193c97ec15214aef70268bd79f94d6f5b2df964 Mon Sep 17 00:00:00 2001 From: raindaywhu Date: Tue, 17 Jun 2025 23:47:49 +0800 Subject: [PATCH 091/153] default 10 turns to wait worker finished --- vllm_ascend/eplb/eplb_updator.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 827ab3ead2..c62a87084e 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -46,6 +46,9 @@ def init_eplb(self, redundant_enable): self.cur_iterations: torch.int64 = 0 + self.wait_worker_iterations: torch.int64 = 0 + self.num_wait_worker_iterations: torch.int64 = 10 + self.planner_block_queue = Queue() self.block_update_queue = Queue(maxsize=1) @@ -64,7 +67,7 @@ def init_eplb(self, redundant_enable): planner_q = self.planner_block_queue, block_update_q = self.block_update_queue, redundant_enable = self.redundant_enable, - policy_type = 0, + policy_type = 2, enable_d2d = True ) @@ -92,10 +95,12 @@ def forward_before(self): self.get_init_expert_map() # Batch after eplb process 
being triggered, get update info provided by eplb process - if self.update_in_flight and self.weight_update_counter == 0: + if self.update_in_flight and self.weight_update_counter == 0 and self.wait_worker_iterations == self.num_wait_worker_iterations: + self.wait_worker_iterations = 0 self.update_info_all = self.block_update_queue.get() + self.weight_loading = True - if self.update_in_flight and self.weight_update_counter < self.num_moe_layers: + if self.update_in_flight and self.weight_loading and self.weight_update_counter < self.num_moe_layers: (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(0) rank_id = torch.distributed.get_rank() self.eplb_loader.set_log2phy_map(log2phy_map) @@ -119,6 +124,11 @@ def forward_end(self): moe_load = self.compute_and_set_moe_load() self.wakeup_eplb_worker() self.update_in_flight = True + self.wait_worker_iterations = 0 + self.weight_loading = False + + if self.update_in_flight: + self.wait_worker_iterations = self.wait_worker_iterations + 1 self.eplb_loader.update_expert_map_and_weight(self.reqs, self.redundant_enable) From db2f630aebb0cad44f9705ac028993233a00c82e Mon Sep 17 00:00:00 2001 From: zzzzwwjj <34335947+zzzzwwjj@users.noreply.github.com> Date: Wed, 18 Jun 2025 00:58:38 +0800 Subject: [PATCH 092/153] [bugfix] fix deepseek with mc2 (#1268) ### What this PR does / why we need it? ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? Signed-off-by: zzzzwwjj <1183291235@qq.com> --- vllm_ascend/ops/fused_moe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 4a4b4887f5..05daf69f79 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1186,7 +1186,8 @@ def forward(self, enable_force_load_balance=enable_force_load_balance, log2phy=self.log2phy, global_redundant_expert_num=self.global_redundant_expert_num, - shared_experts=shared_experts, + shared_experts=shared_experts if self.torchair_graph_enabled + and self.enable_multistream_moe and not is_prefill else None, ) if shared_experts: From 78b74803f09fb19bb02269b67d250b38921e926f Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Wed, 18 Jun 2025 15:51:38 +0800 Subject: [PATCH 093/153] fix bug in compose_expert_update_info_bipartite when adding node --- vllm_ascend/eplb/core/worker/eplb_worker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 431d63db69..123ab8a30e 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -27,7 +27,7 @@ from vllm.logger import logger from vllm_ascend.eplb.core.policy.policy_factory import PolicyFactory, DynamicConfig -from vllm_ascend.eplb.tool.eplb_utils import ExpertMapUtils +from vllm_ascend.eplb.tool.eplb_utils import ExpertMapUtils class EplbWorker: @@ -147,7 +147,7 @@ def compose_expert_update_info_bipartite(self, updated_expert_maps_org, current_ src_rank_ids = src_ranks_set[expert_id] graph_expert_update.add_nodes_from(src_rank_ids, bipartite=0) # add dest rank - graph_expert_update.add_nodes_from(str(dst_rank_id), bipartite=1) + graph_expert_update.add_node(str(dst_rank_id), bipartite=1) # add edges for src_rank_id in src_rank_ids: graph_expert_update.add_edge(src_rank_id, str(dst_rank_id)) @@ -335,7 +335,7 @@ def __init__(self, shared_dict, planner_q, block_update_q, redundant_enable, pol 
self.enable_d2d = enable_d2d self.planner_q = planner_q self.block_update_q = block_update_q - self.redundant_enable = redundant_enable + self.redundant_enable = redundant_enable # Create EplbWorker instance self.worker = EplbWorker(self.shared_dict, self.policy_type, self.enable_d2d) From 2cd8ecdc4f1d9ee5e24a0fa70a2b4089785f8837 Mon Sep 17 00:00:00 2001 From: Shanshan Shen <467638484@qq.com> Date: Wed, 18 Jun 2025 17:50:20 +0800 Subject: [PATCH 094/153] [Bugfix][Spec Decode] Enable `ACL_OP_INIT_MODE=1` directly only when using V0 spec decode (#1258) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? Enable `ACL_OP_INIT_MODE=1` directly only when using V0 spec decode. Find more details at **mengwei805**'s comment in https://github.com/vllm-project/vllm-ascend/pull/1123. ### Does this PR introduce _any_ user-facing change? The user will not be aware of `VLLM_ASCEND_ACL_OP_INIT_MODE` (`ACL_OP_INIT_MODE`). ### How was this patch tested? Test scripts: ```python from vllm import LLM, SamplingParams prompts = [ "The future of AI is", ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) llm = LLM( model="Qwen/Qwen2.5-1.5B-Instruct", tensor_parallel_size=1, speculative_config={ "method": "ngram", "num_speculative_tokens": 5, "prompt_lookup_max": 4, }, ) outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` Results: ``` Adding requests: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 76.70it/s] Processed prompts: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1.33it/s, est. speed input: 6.64 toks/s, output: 21.26 toks/s] Prompt: 'The future of AI is', Generated text: ' bright\n\n04/15/2020\n\nBy: James' ``` --------- Signed-off-by: shen-shanshan <467638484@qq.com> --- vllm_ascend/envs.py | 9 --------- vllm_ascend/platform.py | 6 +++--- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index 74e9c1969e..02ecd6625b 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -111,15 +111,6 @@ # 1: enable moe_all2all_buffer. "MOE_ALL2ALL_BUFFER": lambda: bool(int(os.getenv("MOE_ALL2ALL_BUFFER", '0'))), - # VLLM_ASCEND_ACL_OP_INIT_MODE: - # 0: default, normal init. - # 1: delay init until launch aclops. - # 2: forbid aclops init and launch. - # Find more details at https://gitee.com/ascend/pytorch/pulls/18094 - # We set this var default to `1` in vllm-ascend to avoid segment fault when - # enable `pin_memory` while creating a tensor using `torch.tensor`. - "VLLM_ASCEND_ACL_OP_INIT_MODE": - lambda: os.getenv("VLLM_ASCEND_ACL_OP_INIT_MODE", '0'), # Some models are optimized by vllm ascend. While in some case, e.g. rlhf # training, the optimized model may not be suitable. In this case, set this # value to False to disable the optimized model. 
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 5a45e9eed7..b9233da05d 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -27,7 +27,6 @@ from vllm.logger import logger from vllm.platforms import Platform, PlatformEnum -import vllm_ascend.envs as ascend_envs from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes @@ -39,8 +38,6 @@ VllmConfig = None FlexibleArgumentParser = None -os.environ["ACL_OP_INIT_MODE"] = ascend_envs.VLLM_ASCEND_ACL_OP_INIT_MODE - class NPUPlatform(Platform): @@ -188,6 +185,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if envs.VLLM_USE_V1: parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker" elif vllm_config.speculative_config: + # NOTE: We set this var to `1` in vllm-ascend to avoid segment + # fault when using spec decode with V0 engine. + os.environ["ACL_OP_INIT_MODE"] = "1" parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker" parallel_config.sd_worker_cls = "vllm_ascend.worker.worker.NPUWorker" elif vllm_config.scheduler_config.is_multi_step: From 8e6b1ee6d3f9d9c40af306ccadb1dab798c74f67 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Wed, 18 Jun 2025 19:17:25 +0800 Subject: [PATCH 095/153] improve running time in generate_expert_d2d_transfer_task --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 10 ++++------ vllm_ascend/eplb/core/loader/device_transfer_loader.py | 6 ++++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 04682fe7e6..21a0c4ab7e 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -62,14 +62,12 @@ def init_buffer_tensor_dict(self, num_buffer_tensor): self.buffer_tensor_dict[name] = torch.empty_like(expert_tensor) def get_buffer_tensor(self, buffer_tensor_id): - for name in self.expert_weight_names: - yield self.buffer_tensor_dict[name][buffer_tensor_id] + return [self.buffer_tensor_dict[name][buffer_tensor_id] for name in self.expert_weight_names] def get_expert_tensor(self, layer_id, global_expert_id_to_send): - for name in self.expert_weight_names: - complete_name = "model.layers." + str(layer_id) + ".mlp.experts." + name - local_expert_id = self.expert_map_per_layer_cpu[layer_id][global_expert_id_to_send].item() - yield self.param_dict[complete_name].data[local_expert_id] + local_expert_id = self.expert_map_per_layer_cpu[layer_id][global_expert_id_to_send].item() + return [self.param_dict["model.layers." + str(layer_id) + ".mlp.experts." 
+ name].data[local_expert_id] + for name in self.expert_weight_names] def get_rank_expert_workload(self, num_moe_layers): return self.model.get_all_moe_loads(num_moe_layers) diff --git a/vllm_ascend/eplb/core/loader/device_transfer_loader.py b/vllm_ascend/eplb/core/loader/device_transfer_loader.py index 91affac06d..bdea81525e 100644 --- a/vllm_ascend/eplb/core/loader/device_transfer_loader.py +++ b/vllm_ascend/eplb/core/loader/device_transfer_loader.py @@ -56,13 +56,15 @@ def generate_expert_d2d_transfer_task(self, expert_send_info, expert_recv_info, self.comm_op_list = [] for send_info in expert_send_info: dst_rank, global_expert_id_to_send = send_info - for src_tensor in self.eplb_adaptor.get_expert_tensor(layer_id, global_expert_id_to_send): + src_tensors_this_expert = self.eplb_adaptor.get_expert_tensor(layer_id, global_expert_id_to_send) + for src_tensor in src_tensors_this_expert: self.comm_op_list.append(dist.P2POp(dist.isend, src_tensor, dst_rank)) buffer_tensor_id = 0 for recv_info in expert_recv_info: recv_rank, global_expert_id_to_recv = recv_info - for buffer_tensor in self.eplb_adaptor.get_buffer_tensor(buffer_tensor_id): + buffer_tensors_this_expert = self.eplb_adaptor.get_buffer_tensor(buffer_tensor_id) + for buffer_tensor in buffer_tensors_this_expert: self.comm_op_list.append(dist.P2POp(dist.irecv, buffer_tensor, recv_rank)) local_expert_to_replace = self.updated_expert_map[global_expert_id_to_recv].item() self.recv_expert_list.append((local_expert_to_replace, buffer_tensor_id)) From ebb2a70dbbdb8f55002de3313e17dfd595e1de1f Mon Sep 17 00:00:00 2001 From: songshanhu07 <1763685535@qq.com> Date: Wed, 18 Jun 2025 19:46:56 +0800 Subject: [PATCH 096/153] static EPLB fix bug, add unit test (#1186) ### What this PR does / why we need it? 1.add static EPLB unit test 2.fix bug: Tensor cannot be directly judged by if statements ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? Run the unit test. --------- Signed-off-by: songshanhu07 <1763685535@qq.com> --- docs/requirements-test.txt | 2 +- requirements-dev.txt | 1 + tests/ut/ops/test_expert_load_balancer.py | 146 ++++++++++++++++++++++ vllm_ascend/quantization/w8a8_dynamic.py | 4 +- 4 files changed, 150 insertions(+), 3 deletions(-) create mode 100644 tests/ut/ops/test_expert_load_balancer.py diff --git a/docs/requirements-test.txt b/docs/requirements-test.txt index b8c6fe990f..c63a6da9cb 100644 --- a/docs/requirements-test.txt +++ b/docs/requirements-test.txt @@ -1,2 +1,2 @@ pytest-asyncio - +pytest-mock diff --git a/requirements-dev.txt b/requirements-dev.txt index 133d4604c2..c6f9241f99 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,6 +4,7 @@ modelscope openai pytest >= 6.0 pytest-asyncio +pytest-mock lm-eval ray types-jsonschema diff --git a/tests/ut/ops/test_expert_load_balancer.py b/tests/ut/ops/test_expert_load_balancer.py new file mode 100644 index 0000000000..049f916fc9 --- /dev/null +++ b/tests/ut/ops/test_expert_load_balancer.py @@ -0,0 +1,146 @@ +# fused moe ops test will hit the infer_schema error, we need add the patch +# here to make the test pass. 
+import vllm_ascend.patch.worker.patch_common.patch_utils # type: ignore[import] # isort: skip # noqa + +import json +from typing import List, TypedDict + +import pytest +import torch + +from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer + + +class Device(TypedDict): + device_id: int + device_expert: List[int] + + +class Layer(TypedDict): + layer_id: int + device_count: int + device_list: List[Device] + + +class MockData(TypedDict): + moe_layer_count: int + layer_list: List[Layer] + + +MOCK_DATA: MockData = { + "moe_layer_count": + 1, + "layer_list": [{ + "layer_id": + 0, + "device_count": + 2, + "device_list": [{ + "device_id": 0, + "device_expert": [7, 2, 0, 3, 5] + }, { + "device_id": 1, + "device_expert": [6, 1, 4, 7, 2] + }] + }] +} + + +@pytest.fixture +def mock_expert_load_balancer(tmp_path): + json_file = tmp_path / "expert_map.json" + with open(json_file, 'w') as f: + json.dump(MOCK_DATA, f) + + return ExpertLoadBalancer(str(json_file), global_expert_num=8) + + +def test_init(mock_expert_load_balancer): + assert isinstance(mock_expert_load_balancer.expert_map_tensor, + torch.Tensor) + assert mock_expert_load_balancer.layers_num == MOCK_DATA["moe_layer_count"] + assert mock_expert_load_balancer.ranks_num == MOCK_DATA["layer_list"][0][ + "device_count"] + + +def test_generate_index_dicts(mock_expert_load_balancer): + tensor_2d = torch.tensor([[7, 2, 0, 3, 5], [6, 1, 4, 7, 2]]) + result = mock_expert_load_balancer.generate_index_dicts(tensor_2d) + expected_result = [{ + 7: 0, + 2: 1, + 0: 2, + 3: 3, + 5: 4 + }, { + 6: 5, + 1: 6, + 4: 7, + 7: 8, + 2: 9 + }] + assert result == expected_result + + +def test_generate_expert_placement_map(mock_expert_load_balancer): + expert_placement_map = mock_expert_load_balancer.generate_expert_placement_map( + ) + assert expert_placement_map.shape == (mock_expert_load_balancer.layers_num, + mock_expert_load_balancer.ranks_num, + 8) + assert torch.all(expert_placement_map >= -1) + + +def test_generate_log2phy_expert_map(mock_expert_load_balancer): + layer_id = 0 + log2phy_map = mock_expert_load_balancer.generate_log2phy_expert_map( + layer_id) + assert log2phy_map.shape == (mock_expert_load_balancer.ranks_num, 8) + assert torch.all(log2phy_map >= -1) + + +def test_get_rank_placement_map(mock_expert_load_balancer, mocker): + mocker.patch("torch_npu.npu._lazy_init") + mocker.patch('torch.npu.current_device', return_value='cpu') + layer_id = 0 + rank_id = 0 + rank_local_expert_num, rank_expert_map = mock_expert_load_balancer.get_rank_placement_map( + layer_id, rank_id) + assert rank_local_expert_num == 5 + expected_tensor = torch.tensor([2, -1, 1, 3, -1, 4, -1, 0], + dtype=torch.int32).to( + rank_expert_map.device) + assert rank_expert_map.equal(expected_tensor) + + rank_id = 1 + rank_local_expert_num, rank_expert_map = mock_expert_load_balancer.get_rank_placement_map( + layer_id, rank_id) + expected_tensor = torch.tensor([-1, 1, 4, -1, 2, -1, 0, 3], + dtype=torch.int32).to( + rank_expert_map.device) + assert rank_expert_map.equal(expected_tensor) + + +def test_get_rank_log2phy_map(mock_expert_load_balancer): + layer_id = 0 + rank_id = 0 + log2phy_map = mock_expert_load_balancer.get_rank_log2phy_map( + layer_id, rank_id) + expected_tensor = torch.tensor([2, 6, 1, 3, 7, 4, 5, 0], + dtype=torch.int32).to(log2phy_map.device) + assert log2phy_map.equal(expected_tensor) + + rank_id = 1 + log2phy_map = mock_expert_load_balancer.get_rank_log2phy_map( + layer_id, rank_id) + expected_tensor = torch.tensor([2, 6, 9, 3, 7, 4, 5, 8], + 
dtype=torch.int32).to(log2phy_map.device) + assert log2phy_map.equal(expected_tensor) + + +def test_get_global_redundant_expert_num(mock_expert_load_balancer): + redundant_expert_num = mock_expert_load_balancer.get_global_redundant_expert_num( + ) + expected_redundant_expert_num = len(MOCK_DATA["layer_list"][0]["device_list"][0]["device_expert"]) * \ + MOCK_DATA["layer_list"][0]["device_count"] - 8 + assert redundant_expert_num == expected_redundant_expert_num diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index aeadc7b2cd..372c29bca7 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -118,7 +118,7 @@ def fused_experts_with_mc2( global_redundant_expert_num: int = 0, shared_experts: Optional[Any] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - if log2phy: + if log2phy is not None: topk_ids = log2phy[topk_ids] global_bs = 0 moe_expert_num = len(expert_map) + global_redundant_expert_num @@ -233,7 +233,7 @@ def fused_experts_with_all2all( log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, ): - if log2phy: + if log2phy is not None: topk_ids = log2phy[topk_ids] original_shape = hidden_states.shape if len(original_shape) == 3: From 43def8a646b27255846943ebc9d114d856f4ad03 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Wed, 18 Jun 2025 20:33:26 +0800 Subject: [PATCH 097/153] add warm up & batch add --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 2 +- vllm_ascend/eplb/eplb_updator.py | 33 +++++++++++++++++++++--- vllm_ascend/models/deepseek_v2.py | 32 +++++++++++++++-------- vllm_ascend/ops/fused_moe.py | 15 ----------- vllm_ascend/worker/model_runner_v1.py | 10 ++++--- 5 files changed, 59 insertions(+), 33 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 21a0c4ab7e..f575a2c579 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -70,7 +70,7 @@ def get_expert_tensor(self, layer_id, global_expert_id_to_send): for name in self.expert_weight_names] def get_rank_expert_workload(self, num_moe_layers): - return self.model.get_all_moe_loads(num_moe_layers) + return self.model.get_all_moe_loads(num_moe_layers, self.global_expert_num) def get_init_expert_map(self, num_moe_layers): expert_map = self.model.get_all_expert_map(num_moe_layers) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index c62a87084e..e2c1080e4a 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -138,13 +138,12 @@ def compute_and_set_moe_load(self): self._gather_buffer = None if dist.is_initialized(): self.world_size = dist.get_world_size() - if dist.is_initialized(): - device = local_load.device + self.device = local_load.device if self._gather_buffer is None: shape = (self.world_size, *local_load.shape) self._gather_buffer = torch.empty(shape, dtype=local_load.dtype, - device=device) + device=self.device) dist.all_gather_into_tensor(self._gather_buffer, local_load) @@ -157,6 +156,34 @@ def compute_and_set_moe_load(self): logger.debug(f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}") return moe_load + def warm_up_eplb(self): + + self.compute_and_set_moe_load() + + src_tensor = torch.empty((1,), device=self.device) + self_rank = dist.get_rank() + + comm_op_list = [] + + for dst_rank in range(self.world_size): + if dst_rank == self_rank: + continue + comm_op_list.append( + 
dist.P2POp(dist.isend, src_tensor, dst_rank) + ) + + for src_rank in range(self.world_size): + if src_rank == self_rank: + continue + comm_op_list.append( + dist.P2POp(dist.irecv, src_tensor, src_rank) + ) + if comm_op_list: + reqs = dist.batch_isend_irecv(comm_op_list) + + for req in reqs: + req.wait() + def shutdown(self): """ Clean up the EPLB process. diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index b032dac6a3..4fce5fd2cb 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -810,19 +810,29 @@ def get_all_expert_map(self,num_moe_layers): return torch.stack(all_loads, dim=0) - def get_moe_load(self,layer_id): - return self.model.layers[layer_id].mlp.experts.get_moe_load() + def get_all_moe_loads( + self, + num_moe_layers: int, + num_experts_per_layer: int + ) -> torch.Tensor: + # 收集各层 topk_ids -> list of [B, K] + all_topk_ids = [self.get_topk_ids(i) for i in range(num_moe_layers)] + # stack & flatten -> ids2d: [L, B*K] + stacked = torch.stack(all_topk_ids, dim=0) # [L, B, K] + L, B, K = stacked.shape + ids2d = stacked.view(L, B * K).to(torch.int64) # [L, N] - def get_all_moe_loads(self, num_moe_layers) -> torch.Tensor: - """ - output: [num_moe_layers, num_experts_per_layer] - """ - all_loads = [] - for layer_id in range(num_moe_layers): - load_tensor = self.get_moe_load(3+layer_id) - all_loads.append(load_tensor) + device = ids2d.device + moe_load = torch.zeros((L, num_experts_per_layer), + dtype=torch.int64, device=device) - return torch.stack(all_loads, dim=0) + ones2d = torch.ones_like(ids2d, dtype=torch.int64) + + assert moe_load.dim() == 2 and ids2d.dim() == 2 and ones2d.dim() == 2 + assert ids2d.shape == ones2d.shape + + moe_load.scatter_add_(dim=1, index=ids2d, src=ones2d) + return moe_load class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM): pass diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 84bcc9dc86..aeb5ae7623 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1251,18 +1251,3 @@ def get_map(self): def get_log2phy_map(self): return self.log2phy - def get_moe_load(self): - return self.moe_load - - def calculate_moe_load(self): - if self.moe_load is None: - self.moe_load = torch.zeros(self.num_experts, - dtype=torch.int64, - device=self.topk_ids.device) - - ids = self.topk_ids.flatten().to(torch.int64) - - ones = torch.ones_like(ids, dtype=torch.int64, device=ids.device) - self.moe_load.scatter_add_(0, ids, ones) - - return self.moe_load diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index d56e8e0682..820f1e000a 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1217,9 +1217,6 @@ def execute_model( ) -> Union[ModelRunnerOutput, torch.Tensor]: if self.dynamic_eplb: - if self.eplb_adaptor == None: - self.eplb_adaptor = VllmEplbAdaptor(model=self.model) - self.eplb_updator.set_adaptor(self.eplb_adaptor) self.eplb_updator.forward_before() with ProfileExecuteDuration().capture_async( @@ -1510,6 +1507,13 @@ def _dummy_run( positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds) + + #EPLB + if self.dynamic_eplb == True: + self.eplb_adaptor = VllmEplbAdaptor(model=self.model) + self.eplb_updator.set_adaptor(self.eplb_adaptor) + self.eplb_updator.warm_up_eplb() + return hidden_states def profile_run(self) -> None: From 9219cc80daf6cf8b49a7f3a267028625ef1f23a4 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Wed, 18 
Jun 2025 20:43:02 +0800 Subject: [PATCH 098/153] delete layer moe load --- vllm_ascend/ops/fused_moe.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index aeb5ae7623..732c5314cc 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1182,10 +1182,6 @@ def forward(self, else: return e_hidden_states - #EPLB calculate moe load - if self.dynamic_eplb == True: - self.calculate_moe_load() - # if self.enable_multistream_shared_expert and not is_prefill: # hidden_states, shared_output = hidden_states From 2125fe032e3b4d09ab7d4eebea9aff259ef191f5 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Wed, 18 Jun 2025 21:01:23 +0800 Subject: [PATCH 099/153] add get_tok_ids --- vllm_ascend/models/deepseek_v2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 4fce5fd2cb..eb4cd1b681 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -810,6 +810,9 @@ def get_all_expert_map(self,num_moe_layers): return torch.stack(all_loads, dim=0) + def get_topk_ids(self,layer_id): + return self.model.layers[layer_id+3].mlp.experts.topk_ids + def get_all_moe_loads( self, num_moe_layers: int, From 4bda9ba31c9a3ff7a3857659328cb6f06bcad5ae Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Wed, 18 Jun 2025 21:29:30 +0800 Subject: [PATCH 100/153] Extract cal_moe_load from deepseek_v2 --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 24 ++++++++++++++++++++++-- vllm_ascend/models/deepseek_v2.py | 24 ------------------------ 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index f575a2c579..aff10b2e75 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -69,8 +69,28 @@ def get_expert_tensor(self, layer_id, global_expert_id_to_send): return [self.param_dict["model.layers." + str(layer_id) + ".mlp.experts." 
+ name].data[local_expert_id] for name in self.expert_weight_names] - def get_rank_expert_workload(self, num_moe_layers): - return self.model.get_all_moe_loads(num_moe_layers, self.global_expert_num) + def get_rank_expert_workload( + self, + num_moe_layers: int, + ) -> torch.Tensor: + # 收集各层 topk_ids -> list of [B, K] + all_topk_ids = [self.model.get_topk_ids(i) for i in range(num_moe_layers)] + # stack & flatten -> ids2d: [L, B*K] + stacked = torch.stack(all_topk_ids, dim=0) # [L, B, K] + L, B, K = stacked.shape + ids2d = stacked.view(L, B * K).to(torch.int64) # [L, N] + + device = ids2d.device + moe_load = torch.zeros((L, self.global_expert_num), + dtype=torch.int64, device=device) + + ones2d = torch.ones_like(ids2d, dtype=torch.int64) + + assert moe_load.dim() == 2 and ids2d.dim() == 2 and ones2d.dim() == 2 + assert ids2d.shape == ones2d.shape + + moe_load.scatter_add_(dim=1, index=ids2d, src=ones2d) + return moe_load def get_init_expert_map(self, num_moe_layers): expert_map = self.model.get_all_expert_map(num_moe_layers) diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index eb4cd1b681..7e4ff2f364 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -812,30 +812,6 @@ def get_all_expert_map(self,num_moe_layers): def get_topk_ids(self,layer_id): return self.model.layers[layer_id+3].mlp.experts.topk_ids - - def get_all_moe_loads( - self, - num_moe_layers: int, - num_experts_per_layer: int - ) -> torch.Tensor: - # 收集各层 topk_ids -> list of [B, K] - all_topk_ids = [self.get_topk_ids(i) for i in range(num_moe_layers)] - # stack & flatten -> ids2d: [L, B*K] - stacked = torch.stack(all_topk_ids, dim=0) # [L, B, K] - L, B, K = stacked.shape - ids2d = stacked.view(L, B * K).to(torch.int64) # [L, N] - - device = ids2d.device - moe_load = torch.zeros((L, num_experts_per_layer), - dtype=torch.int64, device=device) - - ones2d = torch.ones_like(ids2d, dtype=torch.int64) - - assert moe_load.dim() == 2 and ids2d.dim() == 2 and ones2d.dim() == 2 - assert ids2d.shape == ones2d.shape - - moe_load.scatter_add_(dim=1, index=ids2d, src=ones2d) - return moe_load class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM): pass From 1b78fb20dcf85eb6e466219dbd9b2a7d3b624e5e Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Thu, 19 Jun 2025 16:37:10 +0800 Subject: [PATCH 101/153] running time reduction forward_before and forward_end --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 42 ++++++++++++------- .../core/loader/device_transfer_loader.py | 7 ++-- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index aff10b2e75..8e00bffd28 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -45,29 +45,38 @@ def __init__(self, model, **args): self.expert_map_per_layer[self.num_dense_layers + layer_idx] =\ self.model.get_expert_map(self.num_dense_layers + layer_idx) - self.buffer_tensor_dict = dict() # TODO: here we set number of buffer tensor equal to number of expert in each laryer, which can be improved num_buffer_tensor = torch.where(self.expert_map_per_layer[self.num_dense_layers] != -1)[0].numel() - self.init_buffer_tensor_dict(num_buffer_tensor) + self.buffer_tensor_list = [[] for _ in range(num_buffer_tensor)] + self.init_buffer_tensor(num_buffer_tensor) + + self.expert_param_per_layer = dict() + self.init_expert_param_per_layer() self.log2phy_map_per_layer = dict() for layer_idx in 
range(self.num_moe_layers): self.log2phy_map_per_layer[self.num_dense_layers + layer_idx] =\ self.model.get_log2phy_map(self.num_dense_layers + layer_idx) - def init_buffer_tensor_dict(self, num_buffer_tensor): + def init_buffer_tensor(self, num_buffer_tensor): for name in self.expert_weight_names: complete_name = "model.layers." + str(self.num_dense_layers) + ".mlp.experts." + name expert_tensor = self.param_dict[complete_name].data[0:num_buffer_tensor] - self.buffer_tensor_dict[name] = torch.empty_like(expert_tensor) - - def get_buffer_tensor(self, buffer_tensor_id): - return [self.buffer_tensor_dict[name][buffer_tensor_id] for name in self.expert_weight_names] - - def get_expert_tensor(self, layer_id, global_expert_id_to_send): - local_expert_id = self.expert_map_per_layer_cpu[layer_id][global_expert_id_to_send].item() - return [self.param_dict["model.layers." + str(layer_id) + ".mlp.experts." + name].data[local_expert_id] - for name in self.expert_weight_names] + buffer_tensors = torch.empty_like(expert_tensor) + for buffer_id in range(num_buffer_tensor): + self.buffer_tensor_list[buffer_id].append(buffer_tensors[buffer_id]) + + def init_expert_param_per_layer(self): + num_local_expert = self.param_dict["model.layers." + str(self.num_dense_layers) +\ + ".mlp.experts." + self.expert_weight_names[0]].data.shape[0] + for moe_layer_id in range(self.num_moe_layers): + layer_idx = self.num_dense_layers + moe_layer_id + self.expert_param_per_layer[layer_idx] = list() + for local_expert_id in range(num_local_expert): + self.expert_param_per_layer[layer_idx].append( + [self.param_dict["model.layers." + str(layer_idx) + ".mlp.experts." + name].data[local_expert_id] + for name in self.expert_weight_names] + ) def get_rank_expert_workload( self, @@ -117,10 +126,11 @@ def do_update_expert_map(self, layer_id, updated_expert_map): self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map) def do_update_expert_weight(self, layer_id, local_expert_to_replace, buffer_tensor_id): - for name in self.expert_weight_names: - complete_name = "model.layers." + str(layer_id) + ".mlp.experts." 
+ name - expert_tensor = self.param_dict[complete_name].data[local_expert_to_replace] - expert_tensor.copy_(self.buffer_tensor_dict[name][buffer_tensor_id]) + for expert_tensor, buffer_tensor in zip( + self.expert_param_per_layer[layer_id][local_expert_to_replace], + self.buffer_tensor_list[buffer_tensor_id] + ): + expert_tensor.copy_(buffer_tensor) def do_update_log2phy_map(self, layer_id, updated_log2phy_map): if self.log2phy_map_per_layer[layer_id] is not None: diff --git a/vllm_ascend/eplb/core/loader/device_transfer_loader.py b/vllm_ascend/eplb/core/loader/device_transfer_loader.py index bdea81525e..20ff7e114b 100644 --- a/vllm_ascend/eplb/core/loader/device_transfer_loader.py +++ b/vllm_ascend/eplb/core/loader/device_transfer_loader.py @@ -56,15 +56,14 @@ def generate_expert_d2d_transfer_task(self, expert_send_info, expert_recv_info, self.comm_op_list = [] for send_info in expert_send_info: dst_rank, global_expert_id_to_send = send_info - src_tensors_this_expert = self.eplb_adaptor.get_expert_tensor(layer_id, global_expert_id_to_send) - for src_tensor in src_tensors_this_expert: + local_expert_id = self.eplb_adaptor.expert_map_per_layer_cpu[layer_id][global_expert_id_to_send].item() + for src_tensor in self.eplb_adaptor.expert_param_per_layer[layer_id][local_expert_id]: self.comm_op_list.append(dist.P2POp(dist.isend, src_tensor, dst_rank)) buffer_tensor_id = 0 for recv_info in expert_recv_info: recv_rank, global_expert_id_to_recv = recv_info - buffer_tensors_this_expert = self.eplb_adaptor.get_buffer_tensor(buffer_tensor_id) - for buffer_tensor in buffer_tensors_this_expert: + for buffer_tensor in self.eplb_adaptor.buffer_tensor_list[buffer_tensor_id]: self.comm_op_list.append(dist.P2POp(dist.irecv, buffer_tensor, recv_rank)) local_expert_to_replace = self.updated_expert_map[global_expert_id_to_recv].item() self.recv_expert_list.append((local_expert_to_replace, buffer_tensor_id)) From e4b1ba0d958d62f9f19d08323fccd261fc012aad Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Thu, 19 Jun 2025 19:40:15 +0800 Subject: [PATCH 102/153] packed update info and put/get --- vllm_ascend/eplb/core/worker/eplb_worker.py | 41 ++++++++++++++++++++- vllm_ascend/eplb/eplb_updator.py | 24 +++++++++++- 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 123ab8a30e..93b178d59e 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -32,12 +32,14 @@ class EplbWorker: - def __init__(self, shared_dict, policy_type, enable_d2d: bool = True): + def __init__(self, shared_dict, policy_type, enable_d2d: bool = True, redundant_enable=0): self.policy_type = policy_type self.policy = PolicyFactory.generate_policy(policy_type, DynamicConfig()) self.shared_dict = shared_dict self.old_expert_maps = None self.enable_d2d = enable_d2d + self.redundant_enable = redundant_enable + self.rank_id = dist.get_rank() def do_update(self): # put data in to queue @@ -75,7 +77,9 @@ def do_update(self): self.old_expert_maps = new_expert_maps logger.info("EPLB Process compute complete") - return update_info + packed_update_info = self.pack_update_info(update_info) + + return packed_update_info def check_expert_placement(self, old_placement, new_placement): num_layers = old_placement.shape[0] @@ -322,6 +326,39 @@ def local2global(self, return placement_global + def pack_update_info(self, update_info_generator): + """ + Pack a list of update info tuples for efficient IPC. 
+ """ + send_all = [] + recv_all = [] + maps = [] + log2phy_all = [] + layer_ids = [] + + for send_info, recv_info, new_expert_map, layer_id in update_info_generator: + + send_all.append(send_info) + recv_all.append(recv_info) + + maps.append(new_expert_map[self.rank_id]) + + log2phy_map = ExpertMapUtils.generate_log2phy_map(new_expert_map) if self.redundant_enable else None + log2phy_all.append(log2phy_map) + + layer_ids.append(layer_id) + + # 把 list of Tensor 堆成一个大 Tensor + stacked_maps = torch.stack(maps, dim=0) # [N, ...] + stacked_log2phy = torch.stack(log2phy_all, dim=0) # [N, ...] + layer_id_tensor = torch.as_tensor(layer_ids, dtype=torch.int64) # [N] + + # 跨进程零拷贝 + for t in (stacked_maps, stacked_log2phy, layer_id_tensor): + t.share_memory_() + + return send_all, recv_all, stacked_maps, stacked_log2phy, layer_id_tensor + class EplbProcess: def __init__(self, shared_dict, planner_q, block_update_q, redundant_enable, policy_type: int = 0, enable_d2d: bool = True): """ diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index e2c1080e4a..e4953d41a3 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -97,7 +97,8 @@ def forward_before(self): # Batch after eplb process being triggered, get update info provided by eplb process if self.update_in_flight and self.weight_update_counter == 0 and self.wait_worker_iterations == self.num_wait_worker_iterations: self.wait_worker_iterations = 0 - self.update_info_all = self.block_update_queue.get() + packed_update_info = self.block_update_queue.get() + self.update_info_all = self.unpack_update_batch(packed_update_info) self.weight_loading = True if self.update_in_flight and self.weight_loading and self.weight_update_counter < self.num_moe_layers: @@ -108,7 +109,7 @@ def forward_before(self): expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] #logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info_this_rank, - expert_recv_info_this_rank, updated_expert_map[rank_id], layer_id + 3) + expert_recv_info_this_rank, updated_expert_map, layer_id + 3) self.weight_update_counter += 1 if self.weight_update_counter == self.num_moe_layers: self.weight_update_counter = 0 @@ -184,6 +185,25 @@ def warm_up_eplb(self): for req in reqs: req.wait() + def unpack_update_batch(self, packed_update_info): + """ + Unpack the IPC batch back into original update_info_list. + """ + send_all, recv_all, stacked_maps, stacked_log2phy, layer_id_tensor = packed_update_info + + # 拆分 Tensor,得到 N 个张量的 tuple + maps = stacked_maps.unbind(0) + log2phy = stacked_log2phy.unbind(0) + + # 把 layer_id_tensor 转成 Python int 列表 + layer_ids = layer_id_tensor.tolist() + + recovered = [ + (s, r, m, l, lid) + for s, r, m, l, lid in zip(send_all, recv_all, maps, log2phy, layer_ids) + ] + return recovered + def shutdown(self): """ Clean up the EPLB process. 
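
A self-contained sketch of the pack/unpack round trip introduced above (plain tensors only; the queue transport, send/recv lists and optional log2phy maps are omitted, and the function names here are illustrative rather than part of the patch):

```python
import torch

def pack(per_layer_maps, layer_ids):
    # Stack the per-layer expert maps into one tensor and move both tensors
    # into shared memory so the consumer process can read them without a copy.
    stacked_maps = torch.stack(per_layer_maps, dim=0)
    layer_id_tensor = torch.as_tensor(layer_ids, dtype=torch.int64)
    stacked_maps.share_memory_()
    layer_id_tensor.share_memory_()
    return stacked_maps, layer_id_tensor

def unpack(stacked_maps, layer_id_tensor):
    # unbind(0) restores the per-layer views; tolist() restores plain ints.
    return list(zip(stacked_maps.unbind(0), layer_id_tensor.tolist()))

if __name__ == "__main__":
    maps = [torch.randint(-1, 4, (8,)) for _ in range(2)]
    print(unpack(*pack(maps, layer_ids=[3, 4])))
```
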
From a9584dd912e336c5f12c2d414f3ece031682a245 Mon Sep 17 00:00:00 2001 From: lt Date: Thu, 19 Jun 2025 20:13:26 +0800 Subject: [PATCH 103/153] add get expert workload --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 2 +- vllm_ascend/eplb/eplb_updator.py | 32 ++++++++++++++++-------- vllm_ascend/worker/model_runner_v1.py | 3 +++ vllm_ascend/worker/worker_v1.py | 5 ++++ 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index aff10b2e75..71f6a27e73 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -71,7 +71,7 @@ def get_expert_tensor(self, layer_id, global_expert_id_to_send): def get_rank_expert_workload( self, - num_moe_layers: int, + num_moe_layers: int, ) -> torch.Tensor: # 收集各层 topk_ids -> list of [B, K] all_topk_ids = [self.model.get_topk_ids(i) for i in range(num_moe_layers)] diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index e2c1080e4a..22adcb7552 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -22,6 +22,7 @@ from vllm_ascend.eplb.core.worker.eplb_worker import EplbProcess from vllm_ascend.eplb.core.loader.device_transfer_loader import D2DExpertWeightLoader + class EplbUpdator: def __init__(self, redundant_enable): @@ -34,7 +35,7 @@ def set_adaptor(self, adaptor): def init_eplb(self, redundant_enable): - self.redundant_enable = redundant_enable + self.redundant_enable = redundant_enable self.num_iterations: torch.int64 = 130 self.weight_update_counter = 0 @@ -63,18 +64,25 @@ def init_eplb(self, redundant_enable): }) self.eplb = EplbProcess( - shared_dict = self.shared_dict, - planner_q = self.planner_block_queue, - block_update_q = self.block_update_queue, - redundant_enable = self.redundant_enable, - policy_type = 2, - enable_d2d = True + shared_dict=self.shared_dict, + planner_q=self.planner_block_queue, + block_update_q=self.block_update_queue, + redundant_enable=self.redundant_enable, + policy_type=2, + enable_d2d=True ) self.eplb_process = self.eplb._launch_process() + # todo - 新增 eplb 周期统计 + + logger.info(f"[ModelRunner] Launched EPLB process (pid={self.eplb_process.pid})") + def get_expert_load(self) -> str: + """todo 确认moe_load的值是什么类型""" + # return '{"a":"b"}' # mock + return self.shared_dict['moe_load'] def get_update_iteration(self): self.cur_iterations = self.cur_iterations + 1 @@ -101,14 +109,16 @@ def forward_before(self): self.weight_loading = True if self.update_in_flight and self.weight_loading and self.weight_update_counter < self.num_moe_layers: - (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(0) + (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop( + 0) rank_id = torch.distributed.get_rank() self.eplb_loader.set_log2phy_map(log2phy_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] - #logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") + # logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info_this_rank, - expert_recv_info_this_rank, updated_expert_map[rank_id], layer_id + 3) + 
expert_recv_info_this_rank, updated_expert_map[rank_id], + layer_id + 3) self.weight_update_counter += 1 if self.weight_update_counter == self.num_moe_layers: self.weight_update_counter = 0 @@ -177,7 +187,7 @@ def warm_up_eplb(self): continue comm_op_list.append( dist.P2POp(dist.irecv, src_tensor, src_rank) - ) + ) if comm_op_list: reqs = dist.batch_isend_irecv(comm_op_list) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 820f1e000a..90c99275a2 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1422,6 +1422,9 @@ def _profile_multimodal(self) -> None: # Cache the dummy encoder outputs. self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) + def do_get_expert_load(self) -> str: + return self.eplb_updator.get_expert_load() + @torch.inference_mode() def _dummy_run( self, diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index ba29dcfc56..e89dfe91e0 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -180,6 +180,11 @@ def execute_model( output = self.model_runner.execute_model(scheduler_output) return output if self.is_driver_worker else None + def get_expert_load(self) -> str: + """ todo 一共几个worker""" + moe_load = self.model_runner.do_get_expert_load() + return moe_load + def load_model(self) -> None: if self.vllm_config.model_config.enable_sleep_mode: allocator = CaMemAllocator.get_instance() From 6592d720c7282abeb8650b9a50af0854b4911f93 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Thu, 19 Jun 2025 20:36:24 +0800 Subject: [PATCH 104/153] fix bug in pack update info --- vllm_ascend/eplb/core/worker/eplb_worker.py | 31 ++++++++++----------- vllm_ascend/eplb/eplb_updator.py | 24 ++++++++++------ 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 93b178d59e..412fb1136a 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -343,19 +343,23 @@ def pack_update_info(self, update_info_generator): maps.append(new_expert_map[self.rank_id]) - log2phy_map = ExpertMapUtils.generate_log2phy_map(new_expert_map) if self.redundant_enable else None - log2phy_all.append(log2phy_map) + if self.redundant_enable is not None: + log2phy_map = ExpertMapUtils.generate_log2phy_map(new_expert_map) + log2phy_all.append(log2phy_map) layer_ids.append(layer_id) # 把 list of Tensor 堆成一个大 Tensor - stacked_maps = torch.stack(maps, dim=0) # [N, ...] - stacked_log2phy = torch.stack(log2phy_all, dim=0) # [N, ...] 
- layer_id_tensor = torch.as_tensor(layer_ids, dtype=torch.int64) # [N] + stacked_maps = torch.stack(maps, dim=0) + layer_id_tensor = torch.as_tensor(layer_ids, dtype=torch.int64) + stacked_maps.share_memory_() + layer_id_tensor.share_memory_() - # 跨进程零拷贝 - for t in (stacked_maps, stacked_log2phy, layer_id_tensor): - t.share_memory_() + if self.redundant_enable: + stacked_log2phy = torch.stack(log2phy_all, dim=0) + stacked_log2phy.share_memory_() + else: + stacked_log2phy = None return send_all, recv_all, stacked_maps, stacked_log2phy, layer_id_tensor @@ -375,7 +379,7 @@ def __init__(self, shared_dict, planner_q, block_update_q, redundant_enable, pol self.redundant_enable = redundant_enable # Create EplbWorker instance - self.worker = EplbWorker(self.shared_dict, self.policy_type, self.enable_d2d) + self.worker = EplbWorker(self.shared_dict, self.policy_type, self.enable_d2d, self.redundant_enable) def worker_process(self, planner_q, block_update_q): @@ -387,17 +391,12 @@ def worker_process(self, planner_q, block_update_q): planner_q.get() - update_info_generator = self.worker.do_update() - update_info_list = [] - - for (send_info , recv_info , new_expert_map, layer_id) in update_info_generator: - log2phy_map = ExpertMapUtils.generate_log2phy_map(new_expert_map) if self.redundant_enable else None - update_info_list.append((send_info , recv_info , new_expert_map, log2phy_map, layer_id)) + packed_update_info = self.worker.do_update() while True: if not block_update_q.empty(): continue - block_update_q.put(update_info_list) + block_update_q.put(packed_update_info) break except Exception as e: diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index e4953d41a3..c6f087cdbb 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -75,7 +75,6 @@ def init_eplb(self, redundant_enable): logger.info(f"[ModelRunner] Launched EPLB process (pid={self.eplb_process.pid})") - def get_update_iteration(self): self.cur_iterations = self.cur_iterations + 1 return self.cur_iterations % self.num_iterations == 0 @@ -191,16 +190,25 @@ def unpack_update_batch(self, packed_update_info): """ send_all, recv_all, stacked_maps, stacked_log2phy, layer_id_tensor = packed_update_info - # 拆分 Tensor,得到 N 个张量的 tuple - maps = stacked_maps.unbind(0) - log2phy = stacked_log2phy.unbind(0) - - # 把 layer_id_tensor 转成 Python int 列表 + maps = stacked_maps.unbind(0) layer_ids = layer_id_tensor.tolist() + if self.redundant_enable: + log2phy_list = stacked_log2phy.unbind(0) + else: + log2phy_list = [None] * len(maps) + + _zip = zip + _send = send_all + _recv = recv_all + _maps = maps + _l2p = log2phy_list + _lids = layer_ids + recovered = [ - (s, r, m, l, lid) - for s, r, m, l, lid in zip(send_all, recv_all, maps, log2phy, layer_ids) + (_s, _r, _m, _lp, _lid) + for _s, _r, _m, _lp, _lid + in _zip(_send, _recv, _maps, _l2p, _lids) ] return recovered From 17fc31eb083d31cae6b1aec7135377a90b357aaf Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Thu, 19 Jun 2025 21:20:56 +0800 Subject: [PATCH 105/153] improve implementation of generate_log2phy_map --- vllm_ascend/eplb/tool/eplb_utils.py | 43 ++++++++++++++--------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/vllm_ascend/eplb/tool/eplb_utils.py b/vllm_ascend/eplb/tool/eplb_utils.py index e36a97b45a..527ac02cef 100644 --- a/vllm_ascend/eplb/tool/eplb_utils.py +++ b/vllm_ascend/eplb/tool/eplb_utils.py @@ -38,28 +38,27 @@ def generate_index_dicts(cls, tensor_2d): @classmethod def generate_log2phy_map(cls, 
expert_map): num_local_experts = expert_map.max() + 1 - ranks_num, global_expert_num = expert_map.shape - local_expert_map = cls.global2local(expert_map, num_local_experts) - concatenated = torch.flatten(local_expert_map) - rank_expert_to_global = cls.generate_index_dicts(local_expert_map) - result_dict: Dict[int, List[int]] = {} - for idx, value in enumerate(concatenated): - key = value.item() - if key not in result_dict: - result_dict[key] = [] - result_dict[key].append(idx) - - log2phy_map = torch.full((ranks_num, global_expert_num), - -1, - dtype=torch.int32) - for rank in range(ranks_num): - for key in result_dict: - indices_in_concat = result_dict[key] - if key in rank_expert_to_global[rank]: - log2phy_map[rank][key] = rank_expert_to_global[rank][key] - else: - chosen_index = random.choice(indices_in_concat) - log2phy_map[rank][key] = chosen_index + log2phy_map = expert_map.clone() + num_ranks, num_global_expert = log2phy_map.shape + + row_indices = torch.arange(num_ranks).view(-1, 1).expand(num_ranks,\ + num_global_expert) * num_local_experts + log2phy_map[log2phy_map != -1] += row_indices[log2phy_map != -1] + + for idx in range(num_global_expert): + positive_rank_idx = torch.where(log2phy_map[:, idx] != -1)[0] + negative_rank_idx = torch.where(log2phy_map[:, idx] == -1)[0] + num_rank_holding_expert = positive_rank_idx.size(0) + + if num_rank_holding_expert == 1: + log2phy_map[negative_rank_idx, idx] = torch.full((num_ranks - 1,), + log2phy_map[positive_rank_idx, idx].item(), + dtype=log2phy_map.dtype) + else: + random_list = [random.choice(log2phy_map[positive_rank_idx, idx]) + for _ in range(num_ranks - num_rank_holding_expert)] + log2phy_map[negative_rank_idx, idx] = torch.tensor(random_list) + return log2phy_map @classmethod From b350edae9a550caf7b9165bcb7df0860c2c84e3f Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Fri, 20 Jun 2025 01:02:52 +0800 Subject: [PATCH 106/153] [UT] refactor test_expert_load_balancer and fix broken CI (#1293) refactor test_expert_load_balancer to keep the ut code style This PR also fixed the break change from https://github.com/vllm-project/vllm/pull/16188/files#diff-e2942ece30a5c580437694ffb964bfc664b510c59244c08e5921b8f5cefb4280 This is just a quick fix. 
We'll support embedding on V1 later Closes: https://github.com/vllm-project/vllm-ascend/issues/1299 Signed-off-by: wangxiyuan --- .../singlecard/core/test_ascend_scheduler.py | 76 ++++--- tests/e2e/singlecard/test_scheduler.py | 24 ++- tests/ut/ops/test_expert_load_balancer.py | 199 +++++++++--------- vllm_ascend/worker/model_runner_v1.py | 70 ++++-- 4 files changed, 217 insertions(+), 152 deletions(-) diff --git a/tests/e2e/singlecard/core/test_ascend_scheduler.py b/tests/e2e/singlecard/core/test_ascend_scheduler.py index c382ebdf40..7d9c1b1ef5 100644 --- a/tests/e2e/singlecard/core/test_ascend_scheduler.py +++ b/tests/e2e/singlecard/core/test_ascend_scheduler.py @@ -98,11 +98,7 @@ def create_scheduler( ) kv_cache_config = KVCacheConfig( num_blocks=num_blocks, # A large number of blocks to hold all requests - **({ - "tensors": {} - } if vllm_version_is("0.9.0") else { - "kv_cache_tensors": [] - }), + kv_cache_tensors=[], kv_cache_groups=[ KVCacheGroupSpec(['layer'], FullAttentionSpec(block_size, 1, 1, torch.float32, @@ -145,8 +141,8 @@ def create_requests(num_requests: int, multi_modal_hashes=None, eos_token_id=EOS_TOKEN_ID, **({ - "arrival_time": 0.0 - } if vllm_version_is("0.9.0") else {}), + "pooling_params": None + } if not vllm_version_is("0.9.1") else {}), ) requests.append(request) return requests @@ -262,7 +258,9 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - ) + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) scheduler.update_from_output(output, model_runner_output) # Schedule the next step. All three requests are running. @@ -286,7 +284,10 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - ) + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) + scheduler.update_from_output(output1, model_runner_output) output2 = scheduler.schedule() assert len(scheduler.running) == 3 @@ -337,7 +338,10 @@ def test_stop_via_update_from_output(): 11]], # First request hits EOS, second continues spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) scheduler.update_from_output(scheduler_output, model_output) @@ -385,7 +389,10 @@ def test_stop_via_update_from_output(): [13, 14]], # First request hits stop token spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) scheduler.update_from_output(scheduler_output, model_output) @@ -432,7 +439,10 @@ def test_stop_via_update_from_output(): [13]], # First request exceeds max_tokens spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) scheduler.update_from_output(scheduler_output, model_output) @@ -474,7 +484,10 @@ def test_stop_via_update_from_output(): sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) scheduler.update_from_output(scheduler_output, model_output) @@ -524,7 +537,10 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool], spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - ) + **({ + 
"pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) + scheduler.update_from_output(scheduler_output0, model_runner_output) # Schedule the next step. @@ -541,7 +557,10 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool], spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - ) + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) + scheduler.update_from_output(scheduler_output1, model_runner_output) @@ -565,8 +584,6 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): 1. Speculated tokens get scheduled correctly 2. Spec decoding stats properly count number of draft and accepted tokens """ - if vllm_version_is("0.9.0"): - return num_spec_tokens = max(1, max(len(t) for t in spec_tokens)) scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens) requests = create_requests(num_requests=len(spec_tokens), num_tokens=1) @@ -593,7 +610,10 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): spec_token_ids=spec_tokens, logprobs=None, prompt_logprobs_dict={}, - ) + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) + engine_core_outputs = scheduler.update_from_output(output, model_runner_output) @@ -632,7 +652,10 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - ) + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) + engine_core_outputs = scheduler.update_from_output(output, model_runner_output) @@ -727,7 +750,9 @@ def make_output(scheduler: AscendScheduler): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - ) + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) def assert_scheduler_empty(scheduler: AscendScheduler): @@ -744,11 +769,10 @@ def assert_scheduler_empty(scheduler: AscendScheduler): assert len(scheduler.encoder_cache_manager.cached) == 0 # KVCache Manager. - if not vllm_version_is("0.9.0"): - assert len(scheduler.kv_cache_manager.coordinator. - single_type_managers[0].req_to_blocks) == 0 - assert len(scheduler.kv_cache_manager.coordinator. - single_type_managers[0].num_cached_block) == 0 + assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + req_to_blocks) == 0 + assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + num_cached_block) == 0 assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 num_free_blocks = ( scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) @@ -789,4 +813,4 @@ def test_memory_leak(): scheduler.update_from_output(scheduler_output, model_runner_output) # Confirm no memory leak. 
- assert_scheduler_empty(scheduler) \ No newline at end of file + assert_scheduler_empty(scheduler) diff --git a/tests/e2e/singlecard/test_scheduler.py b/tests/e2e/singlecard/test_scheduler.py index 8021f0306c..b3adf945bf 100644 --- a/tests/e2e/singlecard/test_scheduler.py +++ b/tests/e2e/singlecard/test_scheduler.py @@ -31,6 +31,7 @@ from vllm.v1.structured_output import StructuredOutputManager from vllm_ascend.core.scheduler import AscendScheduler +from vllm_ascend.utils import vllm_version_is EOS_TOKEN_ID = 50256 @@ -130,6 +131,9 @@ def create_requests(num_requests: int, multi_modal_placeholders=mm_position, multi_modal_hashes=None, eos_token_id=EOS_TOKEN_ID, + **({ + "pooling_params": None + } if not vllm_version_is("0.9.1") else {}), ) requests.append(request) return requests @@ -237,7 +241,10 @@ def test_stop_via_update_from_output(): 11]], # First request hits EOS, second continues spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) scheduler.update_from_output(scheduler_output, model_output) @@ -283,7 +290,10 @@ def test_stop_via_update_from_output(): [13, 14]], # First request hits stop token spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) scheduler.update_from_output(scheduler_output, model_output) @@ -328,7 +338,10 @@ def test_stop_via_update_from_output(): [13]], # First request exceeds max_tokens spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) scheduler.update_from_output(scheduler_output, model_output) @@ -369,7 +382,10 @@ def test_stop_via_update_from_output(): sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + **({ + "pooler_output": [] + } if not vllm_version_is("0.9.1") else {})) scheduler.update_from_output(scheduler_output, model_output) diff --git a/tests/ut/ops/test_expert_load_balancer.py b/tests/ut/ops/test_expert_load_balancer.py index 049f916fc9..3b7a69ddd4 100644 --- a/tests/ut/ops/test_expert_load_balancer.py +++ b/tests/ut/ops/test_expert_load_balancer.py @@ -3,9 +3,10 @@ import vllm_ascend.patch.worker.patch_common.patch_utils # type: ignore[import] # isort: skip # noqa import json +import unittest from typing import List, TypedDict +from unittest import mock -import pytest import torch from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer @@ -46,101 +47,101 @@ class MockData(TypedDict): } -@pytest.fixture -def mock_expert_load_balancer(tmp_path): - json_file = tmp_path / "expert_map.json" - with open(json_file, 'w') as f: - json.dump(MOCK_DATA, f) - - return ExpertLoadBalancer(str(json_file), global_expert_num=8) - - -def test_init(mock_expert_load_balancer): - assert isinstance(mock_expert_load_balancer.expert_map_tensor, - torch.Tensor) - assert mock_expert_load_balancer.layers_num == MOCK_DATA["moe_layer_count"] - assert mock_expert_load_balancer.ranks_num == MOCK_DATA["layer_list"][0][ - "device_count"] - - -def test_generate_index_dicts(mock_expert_load_balancer): - tensor_2d = torch.tensor([[7, 2, 0, 3, 5], [6, 1, 4, 7, 2]]) - result = mock_expert_load_balancer.generate_index_dicts(tensor_2d) - expected_result = [{ - 7: 0, - 2: 1, - 0: 2, - 3: 3, - 5: 4 - }, { - 6: 5, - 1: 6, - 4: 7, - 7: 8, - 2: 9 - }] - assert 
result == expected_result - - -def test_generate_expert_placement_map(mock_expert_load_balancer): - expert_placement_map = mock_expert_load_balancer.generate_expert_placement_map( - ) - assert expert_placement_map.shape == (mock_expert_load_balancer.layers_num, - mock_expert_load_balancer.ranks_num, - 8) - assert torch.all(expert_placement_map >= -1) - - -def test_generate_log2phy_expert_map(mock_expert_load_balancer): - layer_id = 0 - log2phy_map = mock_expert_load_balancer.generate_log2phy_expert_map( - layer_id) - assert log2phy_map.shape == (mock_expert_load_balancer.ranks_num, 8) - assert torch.all(log2phy_map >= -1) - - -def test_get_rank_placement_map(mock_expert_load_balancer, mocker): - mocker.patch("torch_npu.npu._lazy_init") - mocker.patch('torch.npu.current_device', return_value='cpu') - layer_id = 0 - rank_id = 0 - rank_local_expert_num, rank_expert_map = mock_expert_load_balancer.get_rank_placement_map( - layer_id, rank_id) - assert rank_local_expert_num == 5 - expected_tensor = torch.tensor([2, -1, 1, 3, -1, 4, -1, 0], - dtype=torch.int32).to( - rank_expert_map.device) - assert rank_expert_map.equal(expected_tensor) - - rank_id = 1 - rank_local_expert_num, rank_expert_map = mock_expert_load_balancer.get_rank_placement_map( - layer_id, rank_id) - expected_tensor = torch.tensor([-1, 1, 4, -1, 2, -1, 0, 3], - dtype=torch.int32).to( - rank_expert_map.device) - assert rank_expert_map.equal(expected_tensor) - - -def test_get_rank_log2phy_map(mock_expert_load_balancer): - layer_id = 0 - rank_id = 0 - log2phy_map = mock_expert_load_balancer.get_rank_log2phy_map( - layer_id, rank_id) - expected_tensor = torch.tensor([2, 6, 1, 3, 7, 4, 5, 0], - dtype=torch.int32).to(log2phy_map.device) - assert log2phy_map.equal(expected_tensor) - - rank_id = 1 - log2phy_map = mock_expert_load_balancer.get_rank_log2phy_map( - layer_id, rank_id) - expected_tensor = torch.tensor([2, 6, 9, 3, 7, 4, 5, 8], - dtype=torch.int32).to(log2phy_map.device) - assert log2phy_map.equal(expected_tensor) - - -def test_get_global_redundant_expert_num(mock_expert_load_balancer): - redundant_expert_num = mock_expert_load_balancer.get_global_redundant_expert_num( - ) - expected_redundant_expert_num = len(MOCK_DATA["layer_list"][0]["device_list"][0]["device_expert"]) * \ - MOCK_DATA["layer_list"][0]["device_count"] - 8 - assert redundant_expert_num == expected_redundant_expert_num +class TestExpertLoadBalancer(unittest.TestCase): + + def setUp(self): + json_file = "expert_map.json" + with open(json_file, 'w') as f: + json.dump(MOCK_DATA, f) + + self.expert_load_balancer = ExpertLoadBalancer(json_file, + global_expert_num=8) + + def test_init(self): + + self.assertIsInstance(self.expert_load_balancer.expert_map_tensor, + torch.Tensor) + self.assertEqual(self.expert_load_balancer.layers_num, + MOCK_DATA["moe_layer_count"]) + self.assertEqual(self.expert_load_balancer.ranks_num, + MOCK_DATA["layer_list"][0]["device_count"]) + + def test_generate_index_dicts(self): + tensor_2d = torch.tensor([[7, 2, 0, 3, 5], [6, 1, 4, 7, 2]]) + result = self.expert_load_balancer.generate_index_dicts(tensor_2d) + expected_result = [{ + 7: 0, + 2: 1, + 0: 2, + 3: 3, + 5: 4 + }, { + 6: 5, + 1: 6, + 4: 7, + 7: 8, + 2: 9 + }] + self.assertEqual(result, expected_result) + + def test_generate_expert_placement_map(self): + expert_placement_map = self.expert_load_balancer.generate_expert_placement_map( + ) + self.assertEqual(expert_placement_map.shape, + (self.expert_load_balancer.layers_num, + self.expert_load_balancer.ranks_num, 8)) + 
self.assertTrue(torch.all(expert_placement_map >= -1)) + + def test_generate_log2phy_expert_map(self): + layer_id = 0 + log2phy_map = self.expert_load_balancer.generate_log2phy_expert_map( + layer_id) + self.assertEqual(log2phy_map.shape, + (self.expert_load_balancer.ranks_num, 8)) + self.assertTrue(torch.all(log2phy_map >= -1)) + + @mock.patch("torch_npu.npu._lazy_init") + @mock.patch("torch.npu.current_device", return_value="cpu") + def test_get_rank_placement_map(self, mock_current_device, mock_lazy_init): + layer_id = 0 + rank_id = 0 + rank_local_expert_num, rank_expert_map = self.expert_load_balancer.get_rank_placement_map( + layer_id, rank_id) + self.assertEqual(rank_local_expert_num, 5) + expected_tensor = torch.tensor([2, -1, 1, 3, -1, 4, -1, 0], + dtype=torch.int32).to( + rank_expert_map.device) + self.assertTrue(rank_expert_map.equal(expected_tensor)) + + rank_id = 1 + rank_local_expert_num, rank_expert_map = self.expert_load_balancer.get_rank_placement_map( + layer_id, rank_id) + expected_tensor = torch.tensor([-1, 1, 4, -1, 2, -1, 0, 3], + dtype=torch.int32).to( + rank_expert_map.device) + self.assertTrue(rank_expert_map.equal(expected_tensor)) + + def test_get_rank_log2phy_map(self): + layer_id = 0 + rank_id = 0 + log2phy_map = self.expert_load_balancer.get_rank_log2phy_map( + layer_id, rank_id) + expected_tensor = torch.tensor([2, 6, 1, 3, 7, 4, 5, 0], + dtype=torch.int32).to( + log2phy_map.device) + self.assertTrue(log2phy_map.equal(expected_tensor)) + + rank_id = 1 + log2phy_map = self.expert_load_balancer.get_rank_log2phy_map( + layer_id, rank_id) + expected_tensor = torch.tensor([2, 6, 9, 3, 7, 4, 5, 8], + dtype=torch.int32).to( + log2phy_map.device) + self.assertTrue(log2phy_map.equal(expected_tensor)) + + def test_get_global_redundant_expert_num(self): + redundant_expert_num = self.expert_load_balancer.get_global_redundant_expert_num( + ) + expected_redundant_expert_num = len(MOCK_DATA["layer_list"][0]["device_list"][0]["device_expert"]) * \ + MOCK_DATA["layer_list"][0]["device_count"] - 8 + self.assertEqual(redundant_expert_num, expected_redundant_expert_num) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 801819b039..89f30bc43c 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -74,7 +74,7 @@ from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata from vllm_ascend.platform import NPUPlatform from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler -from vllm_ascend.utils import ProfileExecuteDuration +from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer if TYPE_CHECKING: @@ -420,19 +420,33 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: generator.manual_seed(sampling_params.seed) else: generator = None - - self.requests[req_id] = CachedRequestState( - req_id=req_id, - prompt_token_ids=new_req_data.prompt_token_ids, - mm_inputs=new_req_data.mm_inputs, - mm_positions=new_req_data.mm_positions, - sampling_params=sampling_params, - generator=generator, - block_ids=new_req_data.block_ids, - num_computed_tokens=new_req_data.num_computed_tokens, - output_token_ids=[], - lora_request=new_req_data.lora_request, - ) + if vllm_version_is("0.9.1"): + self.requests[req_id] = CachedRequestState( + req_id=req_id, + prompt_token_ids=new_req_data.prompt_token_ids, + mm_inputs=new_req_data.mm_inputs, + mm_positions=new_req_data.mm_positions, + 
sampling_params=sampling_params, + generator=generator, + block_ids=new_req_data.block_ids, + num_computed_tokens=new_req_data.num_computed_tokens, + output_token_ids=[], + lora_request=new_req_data.lora_request, + ) + else: + self.requests[req_id] = CachedRequestState( + req_id=req_id, + prompt_token_ids=new_req_data.prompt_token_ids, + mm_inputs=new_req_data.mm_inputs, + mm_positions=new_req_data.mm_positions, + sampling_params=sampling_params, + pooling_params=None, + generator=generator, + block_ids=new_req_data.block_ids, + num_computed_tokens=new_req_data.num_computed_tokens, + output_token_ids=[], + lora_request=new_req_data.lora_request, + ) # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: @@ -1305,15 +1319,25 @@ def execute_model( hidden_states, attn_metadata, ) - - model_runner_output = ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=valid_sampled_token_ids, - spec_token_ids=spec_token_ids, - logprobs=logprobs_lists, - prompt_logprobs_dict={}, - ) + if vllm_version_is("0.9.1"): + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + spec_token_ids=spec_token_ids, + logprobs=logprobs_lists, + prompt_logprobs_dict={}, + ) + else: + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + spec_token_ids=spec_token_ids, + logprobs=logprobs_lists, + prompt_logprobs_dict={}, + pooler_output=[], + ) durations = ProfileExecuteDuration().pop_captured_sync() if durations: From 22de4ee1970f981520d99fb1ffd596f6d87a1f90 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Fri, 20 Jun 2025 10:16:34 +0800 Subject: [PATCH 107/153] fix warm up & change init expert map from file --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 21 +++++++++++++++++++++ vllm_ascend/eplb/eplb_updator.py | 11 ++++++----- vllm_ascend/worker/model_runner_v1.py | 15 ++++++++------- vllm_ascend/worker/worker_v1.py | 2 ++ 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 8e00bffd28..512a43821f 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -121,6 +121,27 @@ def get_init_expert_map(self, num_moe_layers): return all_expert_maps + def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path): + expert_map_tensor, layers_num, ranks_num = self._expert_file_to_tensor(expert_map_path) + for layer_idx in range(num_moe_layers): + self.expert_map_per_layer_cpu[layer_idx] = \ + expert_map_tensor[layer_idx][self.rank_id] + + def _expert_file_to_tensor(self, expert_map_path: str): + with open(expert_map_path, "r") as f: + data = json.load(f) + layers_num = data["moe_layer_count"] + gpus_num = data["layer_list"][0]["device_count"] + + tensor_data = [] + for layer in data["layer_list"]: + device_data = [] + for device in layer["device_list"]: + device_data.append(device["device_expert"]) + tensor_data.append(device_data) + expert_map_tensor = torch.tensor(tensor_data, dtype=torch.int32) + return expert_map_tensor, layers_num, gpus_num + def do_update_expert_map(self, layer_id, updated_expert_map): self.expert_map_per_layer[layer_id].copy_(updated_expert_map) self.expert_map_per_layer_cpu[layer_id].copy_(updated_expert_map) diff --git 
a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index c6f087cdbb..9418cd4a89 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -24,18 +24,19 @@ class EplbUpdator: - def __init__(self, redundant_enable): - self.init_eplb(redundant_enable) + def __init__(self, expert_map_path): + self.init_eplb(expert_map_path) def set_adaptor(self, adaptor): self.adaptor = adaptor self.eplb_loader = D2DExpertWeightLoader(eplb_adaptor=self.adaptor) self.num_moe_layers = self.adaptor.num_moe_layers - def init_eplb(self, redundant_enable): + def init_eplb(self, expert_map_path): - self.redundant_enable = redundant_enable + self.redundant_enable = (expert_map_path != None) self.num_iterations: torch.int64 = 130 + self.expert_map_path = expert_map_path self.weight_update_counter = 0 self.expert_map_initialized = False @@ -82,7 +83,7 @@ def get_update_iteration(self): def get_init_expert_map(self): try: if not self.expert_map_initialized: - self.shared_dict["expert_maps"] = self.adaptor.get_init_expert_map(self.num_moe_layers) + self.shared_dict["expert_maps"] = self.adaptor.get_init_expert_map_from_file(self.num_moe_layers, self.expert_map_path) self.expert_map_initialized = True except Exception as e: logger.warning(f"[ModelRunner] Failed to wake EPLB process: {e}", exc_info=True) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index f0ba0980cd..0e07f894a2 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -375,7 +375,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.dynamic_eplb = ascend_config.dynamic_eplb if self.dynamic_eplb == True: self.eplb_adaptor = None - self.eplb_updator = EplbUpdator(ascend_config.expert_map_path != None) + self.eplb_updator = EplbUpdator(ascend_config.expert_map_path) def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler @@ -1517,12 +1517,6 @@ def _dummy_run( intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds) - #EPLB - if self.dynamic_eplb == True: - self.eplb_adaptor = VllmEplbAdaptor(model=self.model) - self.eplb_updator.set_adaptor(self.eplb_adaptor) - self.eplb_updator.warm_up_eplb() - return hidden_states def profile_run(self) -> None: @@ -1564,6 +1558,13 @@ def profile_run(self) -> None: self.encoder_cache.clear() gc.collect() + def eplb_warmup(self): + #EPLBMore actions + if self.dynamic_eplb == True: + self.eplb_adaptor = VllmEplbAdaptor(model=self.model) + self.eplb_updator.set_adaptor(self.eplb_adaptor) + self.eplb_updator.warm_up_eplb() + def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index ba29dcfc56..d65d6bb403 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -203,6 +203,8 @@ def compile_or_warm_up_model(self) -> None: for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) self.model_runner._dummy_run(size) + self.model_runner.eplb_warmup() + if not self.model_config.enforce_eager: self.model_runner.capture_model() # Reset the seed to ensure that the random state is not affected by From e83f89d94dff16f074971aa227aea43727ebd361 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Fri, 20 Jun 2025 10:43:20 +0800 Subject: [PATCH 108/153] add moe load in worker_v1 --- 
vllm_ascend/eplb/eplb_updator.py | 5 +++++ vllm_ascend/worker/model_runner_v1.py | 3 +++ vllm_ascend/worker/worker_v1.py | 7 ++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 9418cd4a89..1e1a372ee4 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -213,6 +213,11 @@ def unpack_update_batch(self, packed_update_info): ] return recovered + def get_expert_load(self) -> str:More actions + """todo 确认moe_load的值是什么类型""" + # return '{"a":"b"}' # mock + return self.shared_dict['moe_load'] + def shutdown(self): """ Clean up the EPLB process. diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 0e07f894a2..9ca6ebf2b7 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1558,6 +1558,9 @@ def profile_run(self) -> None: self.encoder_cache.clear() gc.collect() + def do_get_expert_load(self) -> str:More actions + return self.eplb_updator.get_expert_load() + def eplb_warmup(self): #EPLBMore actions if self.dynamic_eplb == True: diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index d65d6bb403..c42e697a8e 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -204,13 +204,18 @@ def compile_or_warm_up_model(self) -> None: logger.info("Compile and warming up model for size %d", size) self.model_runner._dummy_run(size) self.model_runner.eplb_warmup() - + if not self.model_config.enforce_eager: self.model_runner.capture_model() # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) + def get_expert_load(self) -> str: + """ todo 一共几个worker""" + moe_load = self.model_runner.do_get_expert_load()More actions + return moe_load + def get_model(self) -> nn.Module: return self.model_runner.get_model() From 1a21f30a7db094f470cac2df8e4a7f15630bb612 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Fri, 20 Jun 2025 14:46:33 +0800 Subject: [PATCH 109/153] fix warm up bugs --- vllm_ascend/worker/worker_v1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 2740357e43..fb4a0b6376 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -201,8 +201,8 @@ def compile_or_warm_up_model(self) -> None: for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) self.model_runner._dummy_run(size) - self.model_runner.eplb_warmup() - + + self.model_runner.eplb_warmup() if not self.model_config.enforce_eager: self.model_runner.capture_model() # Reset the seed to ensure that the random state is not affected by From 051f77a89fdd431ebd3404e5492a6d4259cdaa3d Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Fri, 20 Jun 2025 15:39:46 +0800 Subject: [PATCH 110/153] fix log2phy bug --- vllm_ascend/eplb/eplb_updator.py | 2 +- vllm_ascend/quantization/w8a8_dynamic.py | 1 - vllm_ascend/worker/model_runner_v1.py | 4 ++-- vllm_ascend/worker/worker_v1.py | 4 ++-- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 1e1a372ee4..3a237d3ac3 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -213,7 +213,7 @@ def unpack_update_batch(self, packed_update_info): ] return recovered - def get_expert_load(self) 
-> str:More actions + def get_expert_load(self) -> str: """todo 确认moe_load的值是什么类型""" # return '{"a":"b"}' # mock return self.shared_dict['moe_load'] diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index 7e9eb94c1d..88f6890d50 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -118,7 +118,6 @@ def fused_experts_with_mc2( global_redundant_expert_num: int = 0, shared_experts: Optional[Any] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - if log2phy is not None: if log2phy is not None: topk_ids = log2phy[topk_ids] global_bs = 0 diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 713b811543..03e689b1fc 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1578,11 +1578,11 @@ def profile_run(self) -> None: self.encoder_cache.clear() gc.collect() - def do_get_expert_load(self) -> str:More actions + def do_get_expert_load(self) -> str: return self.eplb_updator.get_expert_load() def eplb_warmup(self): - #EPLBMore actions + #EPLB if self.dynamic_eplb == True: self.eplb_adaptor = VllmEplbAdaptor(model=self.model) self.eplb_updator.set_adaptor(self.eplb_adaptor) diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index fb4a0b6376..16e1bec219 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -201,7 +201,7 @@ def compile_or_warm_up_model(self) -> None: for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) self.model_runner._dummy_run(size) - + self.model_runner.eplb_warmup() if not self.model_config.enforce_eager: self.model_runner.capture_model() @@ -211,7 +211,7 @@ def compile_or_warm_up_model(self) -> None: def get_expert_load(self) -> str: """ todo 一共几个worker""" - moe_load = self.model_runner.do_get_expert_load()More actions + moe_load = self.model_runner.do_get_expert_load() return moe_load def get_model(self) -> nn.Module: From d0c98c9a63006ba66b7627c8373d9eaf682e6f3c Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Fri, 20 Jun 2025 17:37:21 +0800 Subject: [PATCH 111/153] fix bugs: batch_isend_irecv synchronization and dtype bug in log2phy --- vllm_ascend/eplb/core/loader/device_transfer_loader.py | 3 ++- vllm_ascend/eplb/tool/eplb_utils.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/eplb/core/loader/device_transfer_loader.py b/vllm_ascend/eplb/core/loader/device_transfer_loader.py index 20ff7e114b..579f653323 100644 --- a/vllm_ascend/eplb/core/loader/device_transfer_loader.py +++ b/vllm_ascend/eplb/core/loader/device_transfer_loader.py @@ -81,7 +81,8 @@ def asyn_expert_weight_transfer(self, reqs): # set asynchronous stream for d2d expert weight transfer if self.comm_op_list: - reqs = dist.batch_isend_irecv(self.comm_op_list) + ret_list = dist.batch_isend_irecv(self.comm_op_list) + reqs.extend(ret_list) self.state = ExpertWeightUpdateState.TRANSFERING diff --git a/vllm_ascend/eplb/tool/eplb_utils.py b/vllm_ascend/eplb/tool/eplb_utils.py index 527ac02cef..4192fa9507 100644 --- a/vllm_ascend/eplb/tool/eplb_utils.py +++ b/vllm_ascend/eplb/tool/eplb_utils.py @@ -57,7 +57,8 @@ def generate_log2phy_map(cls, expert_map): else: random_list = [random.choice(log2phy_map[positive_rank_idx, idx]) for _ in range(num_ranks - num_rank_holding_expert)] - log2phy_map[negative_rank_idx, idx] = torch.tensor(random_list) + 
log2phy_map[negative_rank_idx, idx] = torch.tensor(random_list,\ + dtype=log2phy_map.dtype) return log2phy_map From 9295a9c153b5ea4a0a66087c95055e6301ceb551 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Sat, 21 Jun 2025 14:22:53 +0800 Subject: [PATCH 112/153] add another check for new placement generated by eplb algorithm --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 3 ++- vllm_ascend/eplb/core/policy/dynamic_ep.py | 4 ++-- .../eplb/core/policy/policy_factory.py | 15 +++++++++++---- vllm_ascend/eplb/core/worker/eplb_worker.py | 19 ++++++++++++------- 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 512a43821f..111cd4adc6 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -18,6 +18,7 @@ import torch import torch.distributed as dist import numpy as np +import json from vllm_ascend.eplb.adaptor.abstract_adaptor import EplbAdaptor from vllm.logger import logger @@ -126,7 +127,7 @@ def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path): for layer_idx in range(num_moe_layers): self.expert_map_per_layer_cpu[layer_idx] = \ expert_map_tensor[layer_idx][self.rank_id] - + def _expert_file_to_tensor(self, expert_map_path: str): with open(expert_map_path, "r") as f: data = json.load(f) diff --git a/vllm_ascend/eplb/core/policy/dynamic_ep.py b/vllm_ascend/eplb/core/policy/dynamic_ep.py index 9023e09d43..c081191aab 100644 --- a/vllm_ascend/eplb/core/policy/dynamic_ep.py +++ b/vllm_ascend/eplb/core/policy/dynamic_ep.py @@ -21,7 +21,7 @@ class DynamicTable: placement_table = None -class DynamicEP(EplbPolicy): +class DynamicEplb(EplbPolicy): def __init__(self, config: DynamicConfig): super().__init__(config) @@ -233,7 +233,7 @@ def compute_balanced_pack(origin_weights, card_num): def get_redundant_num(npu_num, counts): redundant_num_each_npu = np.sum(counts - 1) return redundant_num_each_npu - + @staticmethod def calculate_max_heat_per_layer(workload_table, layer_num): max_heat_per_layer = [] diff --git a/vllm_ascend/eplb/core/policy/policy_factory.py b/vllm_ascend/eplb/core/policy/policy_factory.py index 4650f832a7..be0ff0323c 100644 --- a/vllm_ascend/eplb/core/policy/policy_factory.py +++ b/vllm_ascend/eplb/core/policy/policy_factory.py @@ -1,7 +1,8 @@ # Copyright Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
from .eplb_policy import EplbPolicy, DynamicConfig from .mock_load_balance import MockLoadBalance -from .dynamic_ep import DynamicEP +from .dynamic_ep import DynamicEplb +from .dynamic_ep_v2 import DynamicEplbV2 @@ -9,8 +10,14 @@ class PolicyFactory: @staticmethod def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy: policy = { - 0:MockLoadBalance , # MockLoadBalance with greedy d2d expert weight composing - 1:DynamicEP, # Dynamic EPLB policy with greedy d2d expert weight composing - 2:DynamicEP, # Dynamic EPLB policy with bipartite d2d expert weight composing + # Applying bipartite d2d expert weight update composing + 0:MockLoadBalance, # MockLoadBalance + 1:DynamicEplb, # Dynamic EPLB policy + 2:DynamicEplbV2, # Dynamic EPLB policy V2 + + # Applying greedy d2d expert weight update composing + 4:MockLoadBalance, # MockLoadBalance + 5:DynamicEplb, # Dynamic EPLB policy + 6:DynamicEplbV2, # Dynamic EPLB policy } return policy.get(policy_type, MockLoadBalance)(config) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 412fb1136a..f63996e4ed 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -71,9 +71,8 @@ def do_update(self): logger.debug(f"[EPLB Process new_map differs, performing D2D") - update_info = self.compose_expert_update_info_greedy(new_expert_maps, self.old_expert_maps)\ - if self.policy_type <= 1 else self.compose_expert_update_info_bipartite(new_expert_maps, self.old_expert_maps) - + update_info = self.compose_expert_update_info_bipartite(new_expert_maps, self.old_expert_maps)\ + if self.policy_type <= 2 else self.compose_expert_update_info_greedy(new_expert_maps, self.old_expert_maps) self.old_expert_maps = new_expert_maps logger.info("EPLB Process compute complete") @@ -86,6 +85,12 @@ def check_expert_placement(self, old_placement, new_placement): num_ranks = old_placement.shape[1] for layer_id in range(num_layers): + # check if any logical expert is not placed on any rank + if torch.unique(new_placement[layer_id]).numel() < torch.unique(old_placement[layer_id]).numel(): + logger.error(f"There exists expert not placed on any rank in layer {layer_id}") + new_placement[layer_id] = old_placement[layer_id] + continue + for rank_id in range(num_ranks): new_placement_check = new_placement[layer_id][rank_id] old_placement_check = old_placement[layer_id][rank_id] @@ -344,14 +349,14 @@ def pack_update_info(self, update_info_generator): maps.append(new_expert_map[self.rank_id]) if self.redundant_enable is not None: - log2phy_map = ExpertMapUtils.generate_log2phy_map(new_expert_map) + log2phy_map = ExpertMapUtils.generate_log2phy_map(new_expert_map) log2phy_all.append(log2phy_map) layer_ids.append(layer_id) # 把 list of Tensor 堆成一个大 Tensor - stacked_maps = torch.stack(maps, dim=0) - layer_id_tensor = torch.as_tensor(layer_ids, dtype=torch.int64) + stacked_maps = torch.stack(maps, dim=0) + layer_id_tensor = torch.as_tensor(layer_ids, dtype=torch.int64) stacked_maps.share_memory_() layer_id_tensor.share_memory_() @@ -362,7 +367,7 @@ def pack_update_info(self, update_info_generator): stacked_log2phy = None return send_all, recv_all, stacked_maps, stacked_log2phy, layer_id_tensor - + class EplbProcess: def __init__(self, shared_dict, planner_q, block_update_q, redundant_enable, policy_type: int = 0, enable_d2d: bool = True): """ From 67fa706e71d15fc307528680be72e07abe433859 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sat, 21 Jun 2025 15:19:50 +0800 
Subject: [PATCH 113/153] add dynamic_ep_v2 --- vllm_ascend/eplb/core/policy/dynamic_ep_v2.py | 871 ++++++++++++++++++ 1 file changed, 871 insertions(+) create mode 100644 vllm_ascend/eplb/core/policy/dynamic_ep_v2.py diff --git a/vllm_ascend/eplb/core/policy/dynamic_ep_v2.py b/vllm_ascend/eplb/core/policy/dynamic_ep_v2.py new file mode 100644 index 0000000000..49eeb4faeb --- /dev/null +++ b/vllm_ascend/eplb/core/policy/dynamic_ep_v2.py @@ -0,0 +1,871 @@ +# Copyright Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. +from collections import defaultdict +import numpy as np +from abc import abstractmethod + + +class DynamicConfig: + placement_policy = None + + max_transferred_expert_per_layer = 100 + # 一台机器上,一层最多搬运多少专家 + + ep_worldsize = 64 # 整个集群上所有的专家分布在多少个die上 + num_die_per_host = 8 # 每台机器上有几个die + + +class EplbPolicy: + def __init__(self, config: DynamicConfig): + self.config = config + + @abstractmethod + def rebalance_experts(self, current_expert_table, expert_workload): + """ + 传入weight并返回相关限制条件下的专家复制和放置 + INPUT: + current_expert_table: [layerId, rankId, expert_num_i] + expert_workload = expert_table[layer0][rankId][expert_num_i] + + RETURNED: (res, expert_table) + res: + 1 -- table_changed + 0 -- not_changed + + expert_table: [layerId, rankId, expert_num_i] + expert_num_i --- [0, MaxExpertPerRank] + expertID = expert_table[layer0][rankId][expert_num_i] + array_values: + [0, 1, 2, 3, 248] + [4, 5, 6, 7, 254] + [8, 9, 10, 11, 71] + ... + [252, 253, 254, 255, 0] + """ + pass + +class DynamicTable: + # workload_table: + # 三维矩阵,[layer, gpus, experts_per_gpu_per_layer] -> value: 所在位置的热度 + # 大小为 层数 * 卡数 * 每层每卡的专家数量 + # 里面i, j, k的元素代表 第 i 层 第 j 张卡第 k 个专家的热度 + # 对于收集不到的专家,填为 -1 + workload_table = None + + # placement_table: + # 三维矩阵,[layer, gpus, experts_per_gpu_per_layer] -> value: 所在位置的物理专家id + # 大小为 层数 * 卡数 * 每层每卡的专家数量 + # 里面i, j, k的元素代表 第 i 层 第 j 张卡第 k 个专家的物理id + # 对于收集不到的专家,填为 -1 + placement_table = None + + +class DynamicEplbV2(EplbPolicy): + + def __init__(self, config: DynamicConfig): + super().__init__(config) + + @staticmethod + def add_redundant(current_expert_table, expert_workload, num_original_expert): + layer_num, npu_num, experts_per_npu = expert_workload.shape + workload_new = np.zeros((layer_num, num_original_expert)) + for layer_idx in range(layer_num): + workload_dict = defaultdict(int) + placement_layer = current_expert_table[layer_idx].copy() + workload_layer = expert_workload[layer_idx].copy() + for npu_idx in range(npu_num): + for expert_idx in range(experts_per_npu): + workload_dict[placement_layer[npu_idx][expert_idx]] += workload_layer[npu_idx][expert_idx] + for expert_idx in range(num_original_expert): + workload_new[layer_idx][expert_idx] = workload_dict[expert_idx] + return workload_new + + @staticmethod + # 热点专家拆分为冗余专家 + def original_compute_balanced_pack_redundancy(origin_weights, card_num, num_redundancy_expert): + # Step 1: Sort the items by weight in descending order (we are sorting by weight now) + # Sort based on the second element (the second value of each tuple) + route_expert_num = len(origin_weights) + route_expert_redundancy = [[] for _ in range(route_expert_num)] + for i in range(num_redundancy_expert): + sorted_indices = np.argsort([t[1] for t in origin_weights], kind='stable')[::-1] + weights = [origin_weights[idx] for idx in sorted_indices] + tmp_raw_weight = weights[0][1] * (len(route_expert_redundancy[weights[0][0]]) + 1) + route_expert_redundancy[weights[0][0]].append(route_expert_num + i) + avg_weight = tmp_raw_weight / 
(len(route_expert_redundancy[weights[0][0]]) + 1) + weights[0] = (weights[0][0], avg_weight) + origin_weights = weights + + # Step 2: Calculate the number of items per box + expert_num = route_expert_num + num_redundancy_expert + items_per_box = expert_num // card_num # Number of items per box + remaining_items = expert_num % card_num # Number of items per box + + # Step 3: Initialize card_num boxes with empty lists to store item IDs + boxes = [[] for _ in range(card_num)] + boxes_weights = [[] for _ in range(card_num)] + box_weights = [0] * card_num # To store the total weight of each box + box_counts = [0] * card_num # To store the number of items in each box + index = 0 + for i in range(route_expert_num): + redundancy_num = len(route_expert_redundancy[i]) + for _ in range(redundancy_num): + cur_weight = 0 + for item, weight in origin_weights: + if item == i: + cur_weight = weight + + boxes[index].append(i) + boxes_weights[index].append(cur_weight) + box_weights[index] += cur_weight + box_counts[index] += 1 + index += 1 + + sorted_indices = np.argsort([t[1] for t in origin_weights], kind='stable')[::-1] + origin_weights = [origin_weights[idx] for idx in sorted_indices] + # Step 4: Distribute items into boxes based on weight + for item_id, weight in origin_weights: + # Find the box with the least items but not full + min_box_index = -1 + for i in range(card_num): + # Only choose boxes that still have space (box_counts[i] < items_per_box) + if box_counts[i] < items_per_box or (box_counts[i] == items_per_box and remaining_items > 0): + if min_box_index == -1 or box_weights[i] < box_weights[min_box_index]: + min_box_index = i + + # Place the item (id) into the selected box + boxes[min_box_index].append(item_id) + boxes_weights[min_box_index].append(weight) + box_weights[min_box_index] += weight + box_counts[min_box_index] += 1 + + # If there's an imbalance in the remaining items, reduce the "remaining_items" counter + if box_counts[min_box_index] == (items_per_box + 1) and remaining_items > 0: + remaining_items -= 1 + + # Step 5: Output each box's contents and total weight + result = [] + for i in range(card_num): + result.append({ + "box_index": i + 1, + "items": boxes[i], # List of item IDs in the box + "weight": boxes_weights[i], + "total_weight": box_weights[i], # Total weight in this box + "item_count": box_counts[i] # Number of items in the box + }) + + return result, boxes + + # 热点专家拆分为冗余专家 + @staticmethod + def compute_balanced_pack_redundancy(origin_weights, card_num, num_redundancy_expert): + route_expert_num = len(origin_weights) + route_expert_redundancy = [[] for _ in range(route_expert_num)] + for i in range(num_redundancy_expert): + sorted_indices = np.argsort([t[1] for t in origin_weights], kind='stable')[::-1] + weights = [origin_weights[idx] for idx in sorted_indices] + tmp_raw_weight = weights[0][1] * (len(route_expert_redundancy[weights[0][0]]) + 1) + route_expert_redundancy[weights[0][0]].append(route_expert_num + i) + avg_weight = tmp_raw_weight / (len(route_expert_redundancy[weights[0][0]]) + 1) + weights[0] = (weights[0][0], avg_weight) + origin_weights = weights + + expert_num = route_expert_num + num_redundancy_expert + if card_num == 0: + raise RuntimeError("card_num can not be 0.") + items_per_box = expert_num // card_num + remaining_items = expert_num % card_num + + boxes = [[] for _ in range(card_num)] + boxes_weights = [[] for _ in range(card_num)] + box_weights = [0] * card_num + box_counts = [0] * card_num + + all_weights = np.zeros((expert_num,), 
dtype='object') + all_weights[: route_expert_num] = origin_weights + + index = route_expert_num + for i in range(route_expert_num): + redundancy_num = len(route_expert_redundancy[i]) + for _ in range(redundancy_num): + for item, weight in origin_weights: + if item == i: + all_weights[index] = (item, weight) + index += 1 + + sorted_indices = np.argsort([t[1] for t in all_weights], kind='stable')[::-1] + all_weights = [all_weights[idx] for idx in sorted_indices] + for item_id, weight in all_weights: + min_box_index = -1 + for i in range(card_num): + if box_counts[i] < items_per_box or (box_counts[i] == items_per_box and remaining_items > 0): + if min_box_index == -1 or box_weights[i] < box_weights[min_box_index]: + if item_id not in boxes[i]: + min_box_index = i + + boxes[min_box_index].append(item_id) + boxes_weights[min_box_index].append(weight) + box_weights[min_box_index] += weight + box_counts[min_box_index] += 1 + + if box_counts[min_box_index] == (items_per_box + 1) and remaining_items > 0: + remaining_items -= 1 + + result = [] + for i in range(card_num): + result.append({ + "box_index": i + 1, + "items": boxes[i], + "weight": boxes_weights[i], + "total_weight": box_weights[i], + "item_count": box_counts[i] + }) + + return result, boxes + + # 无冗余专家方案 + @staticmethod + def compute_balanced_pack(origin_weights, card_num): + sorted_indices = np.argsort([t[1] for t in origin_weights])[::-1] + weights = origin_weights[sorted_indices] + expert_num = len(weights) + if card_num == 0: + raise RuntimeError("card_num can not be 0.") + items_per_box = expert_num // card_num + remaining_items = expert_num % card_num + + boxes = [[] for _ in range(card_num)] + boxes_weights = [[] for _ in range(card_num)] + box_weights = [0] * card_num + box_counts = [0] * card_num + + for item_id, weight in weights: + min_box_index = -1 + for i in range(card_num): + if box_counts[i] < items_per_box or (box_counts[i] == items_per_box and remaining_items > 0): + if min_box_index == -1 or box_weights[i] < box_weights[min_box_index]: + min_box_index = i + + boxes[min_box_index].append(item_id) + boxes_weights[min_box_index].append(weight) + box_weights[min_box_index] += weight + box_counts[min_box_index] += 1 + + if box_counts[min_box_index] == (items_per_box + 1) and remaining_items > 0: + remaining_items -= 1 + + result = [] + for i in range(card_num): + result.append({ + "box_index": i + 1, + "items": boxes[i], + "weight": boxes_weights[i], + "total_weight": box_weights[i], + "item_count": box_counts[i] + }) + + return result, boxes + + @staticmethod + def get_redundant_num(npu_num, counts): + redundant_num_each_npu = np.sum(counts - 1) + return redundant_num_each_npu + + @staticmethod + def calculate_max_heat_per_layer(workload_table, layer_num): + max_heat_per_layer = [] + for layer_idx in range(layer_num): + npu_heats_now = np.sum(workload_table[layer_idx], axis=1) + max_heat_per_layer.append(np.max(npu_heats_now)) + return max_heat_per_layer + + @staticmethod + def calculate_initial_imbalance(global_deployment, new_layer_workloads): + + device_num = global_deployment.shape[1] + layer_imbalance = [] + expert_num = np.zeros_like(new_layer_workloads) + # 基于部署做更新负载 + for layer_id, layer in enumerate(global_deployment): + for device in layer: + for expert_id in device: + expert_num[layer_id][expert_id] += 1 + + for layer_id, layer in enumerate(global_deployment): + cur_layer_max_workload = 0 + total_workload = 0 + for box in layer: + box_workload = 0 + for expert_id in box: + update_workload = 
new_layer_workloads[layer_id][expert_id] / expert_num[layer_id][expert_id] + box_workload += update_workload + total_workload += update_workload + if cur_layer_max_workload < box_workload: + cur_layer_max_workload = box_workload + + cur_layer_imbalance = cur_layer_max_workload / (total_workload / device_num) + layer_imbalance.append(cur_layer_imbalance) + + return layer_imbalance + + def rebalance_experts(self, current_expert_table, expert_workload): + + info = DynamicTable() + info.workload_table = np.array(expert_workload) + info.placement_table = np.array(current_expert_table) + layer_num, num_npus, experts_per_npu = info.workload_table.shape + expert_ids, counts = np.unique(info.placement_table[0], return_counts=True) + num_redundancy_expert = self.get_redundant_num(num_npus, counts) + num_original_expert = len(expert_ids) + layer_workloads = self.add_redundant(info.placement_table, info.workload_table, num_original_expert) + max_heat_per_layer_before = self.calculate_max_heat_per_layer(info.workload_table, layer_num) + npu_heat_all_origin = sum(max_heat_per_layer_before) + + # 计算负载均衡,部署冗余专家 + layer_num = layer_workloads.shape[0] + expert_num = layer_workloads.shape[1] + # 校验专家数量、卡数量、冗余专家数量不能超过卡数量 + if num_original_expert != expert_num: + raise ValueError(f"原始专家数量 {num_original_expert} 必须等于 expert_num {expert_num}") + + if num_npus <= 0: + raise ValueError("NPUs 数量必须大于 0") + + if num_npus < num_redundancy_expert: + raise ValueError(f"NPUs 数量 {num_npus} 必须大于或等于冗余专家数量 {num_redundancy_expert}") + + # 每个卡部署的专家数量 一个冗余专家 + global_deployment = [[[] for _ in range(num_npus)] for _ in range(layer_num)] + # 遍历获得每一层的放置策略,考虑计算均衡 + max_heat_per_layer_after = np.zeros([layer_num]) + for layer in range(layer_num): + # 获取当前层专家ID和对应负载,负载需要进行正则化处理, 每个卡加一个冗余专家 + weights = np.zeros((expert_num,), dtype='object') + for expert_id, workload_weight in enumerate(layer_workloads[layer]): + weights[expert_id] = (expert_id, workload_weight) + + # 获取每一层全局计算均衡的放置策略 + result, layer_deployment = self.original_compute_balanced_pack_redundancy( + weights, num_npus, num_redundancy_expert + ) + global_deployment[layer] = layer_deployment + max_heat_per_layer_after[layer] = max(result, key=lambda x: x['total_weight'])['total_weight'] + + # 获取层优先级 + layer_changed_ratio = [] + for layer_idx in range(layer_num): + layer_changed_ratio.append(max_heat_per_layer_after[layer_idx] / max_heat_per_layer_before[layer_idx]) + + per_layer_priority = np.argsort(layer_changed_ratio) + npu_heat_all_after = sum(max_heat_per_layer_after) + + change = 0 + + return change, per_layer_priority, np.array(global_deployment).tolist() + + @staticmethod + def compute_redundant_assignments(base_experts, num_redundant_experts, num_experts): + """ + 计算每个基础专家需要分配的冗余专家,并动态调整专家权重 + 返回冗余分配表和更新后的基础专家权重列表 + """ + redundant_assignments = [[] for _ in range(num_experts)] + current_weights = base_experts.copy() + + for i in range(num_redundant_experts): + # 按权重降序排序(使用稳定排序保持相同权重的顺序) + sorted_indices = np.argsort([w for _, w in current_weights], kind='stable')[::-1] + sorted_weights = [current_weights[i] for i in sorted_indices] + + # 选择当前权重最高的专家 + target_expert = sorted_weights[0] + expert_id, original_weight = target_expert + + # 计算添加冗余后的新平均权重 + current_redundancy = len(redundant_assignments[expert_id]) + new_avg_weight = original_weight * (current_redundancy + 1) / (current_redundancy + 2) + + # 更新分配表和权重列表 + redundant_assignments[expert_id].append(num_experts + i) + current_weights[sorted_indices[0]] = (expert_id, new_avg_weight) + + sorted_indices = 
np.argsort([w for _, w in current_weights], kind='stable')[::-1] + sorted_weights = [current_weights[i] for i in sorted_indices] + + return redundant_assignments, sorted_weights + + @staticmethod + def prepare_expert_list(base_experts, redundant_assignments, num_redundant_experts): + """ + 生产冗余专家的完整列表,并按权重降序排序 + """ + redundant_expert_list = np.empty(num_redundant_experts, dtype=object) + + # 填充冗余专家(使用对应基础专家的当前权重) + index = 0 + num_experts = len(redundant_assignments) + for expert_id in range(num_experts): + for _ in redundant_assignments[expert_id]: + redundant_expert_list[index] = (expert_id, next(w for eid, w in base_experts if eid == expert_id)) + index += 1 + + # 按权重降序排序 + sorted_indices = np.argsort([w for _, w in redundant_expert_list], kind='stable')[::-1] + return [redundant_expert_list[i] for i in sorted_indices] + + @staticmethod + def non_redundant_expert_information(origin_deployment, updated_weights, num_radundant_experts): + + device_num = len(origin_deployment) + + device_assignments = [[] for _ in range(device_num)] + device_weights = [[] for _ in range(device_num)] + device_loads = [0] * device_num + device_counts = [0] * device_num + if num_radundant_experts: + start_id = 1 + else: + start_id = 0 + + # 统计卡上非冗余专家信息 + for box_id, box in enumerate(origin_deployment): + for i in range(start_id, len(box)): + device_assignments[box_id].append(box[i]) + cur_weight = next(weight for expert_id, weight in updated_weights if expert_id == box[i]) + device_weights[box_id].append(cur_weight) + device_loads[box_id] += cur_weight + device_counts[box_id] += 1 + + return device_assignments, device_weights, device_loads, device_counts + + @staticmethod + def distribute_redun_experts(device_assignments, device_weights, device_loads, device_counts, redundant_expert_list, + items_per_device, expert_form_device, num_experts): + + num_devices = len(device_assignments) + com_between_devices = [{} for _ in range(num_devices)] + + for expert_id, weight in redundant_expert_list: + # 寻找最优设备(满足容量限制且负载最小) + candidate = -1 + for dev_id in range(num_devices): + # 保证设备内节点不同 + if expert_id in device_assignments[dev_id]: + continue + # 检查容量限制 + if device_counts[dev_id] < items_per_device: + # 选择负载最小的候选设备 + if candidate == -1 or device_loads[dev_id] < device_loads[candidate]: + candidate = dev_id + if candidate != -1: + # 分配专家到选定的设备 + device_assignments[candidate].insert(0, expert_id) + device_weights[candidate].insert(0, weight) + device_loads[candidate] += weight + device_counts[candidate] += 1 + + communication_box_index = expert_form_device[expert_id] + com_between_devices[candidate][communication_box_index] = expert_id + # 极端情况下存在冗余专家没装箱 导致箱子有空位 随机填入专家 待优化 + for dev_id in range(num_devices): + # 检查容量限制 + if device_counts[dev_id] < items_per_device: + # 遍历合适的专家 + for expert_id in range(num_experts): + if expert_id not in device_assignments[dev_id]: + # 找到对应权重 + weight = 0 + for i in range(num_devices): + for j in range(len(device_assignments[i])): + if expert_id == device_assignments[i][j]: + weight = device_weights[i][j] + # 和该专家相关的卡权重发生变化 待修改 + device_assignments[dev_id].insert(0, expert_id) + device_weights[dev_id].insert(0, weight) + device_loads[dev_id] += weight + device_counts[dev_id] += 1 + + communication_box_index = expert_form_device[expert_id] + com_between_devices[dev_id][communication_box_index] = expert_id + break + #todo 重新生成权重 + + return device_assignments, device_weights, device_loads, device_counts, com_between_devices + + @staticmethod + def redundancy_again(self, origin_weights, 
num_redundant_experts, origin_deployment, expert_form_device, num_node, + is_node_redundant): + + # 每张卡上专家数量 + expert_num_per_device = origin_deployment.shape[1] + + num_experts = len(origin_weights) + if is_node_redundant: + num_experts = num_experts * num_node + + # 根据新负载重新计算冗余专家 + redundant_assignments, updated_weights = self.compute_redundant_assignments(origin_weights, + num_redundant_experts, + num_experts) + + # 收集冗余专家信息并排序 + redundant_expert_list = self.prepare_expert_list(updated_weights, redundant_assignments, num_redundant_experts) + + # 收集重新计算冗余后卡上非冗余专家信息 + device_assignments, device_weights, device_loads, device_counts = self.non_redundant_expert_information( + origin_deployment, updated_weights, num_redundant_experts) + + # 新计算的冗余专家进行分配 + device_assignments, device_weights, device_loads, device_counts, com_between_devices = self.distribute_redun_experts( + device_assignments, + device_weights, + device_loads, + device_counts, + redundant_expert_list, + expert_num_per_device, + expert_form_device, + num_experts) + + + return device_assignments, device_weights, device_loads, device_counts, com_between_devices + + @staticmethod + def generate_allocation_report(device_assignments, device_weights, device_loads, device_counts): + """ + 生成最终分配报告并计算最大负载 + """ + report = [] + max_load = 0.0 + + for dev_id in range(len(device_assignments)): + current_load = device_loads[dev_id] + max_load = max(max_load, current_load) + + report.append({ + "device_id": dev_id + 1, + "assigned_experts": device_assignments[dev_id], + "expert_weights": device_weights[dev_id], + "total_load": current_load, + "expert_count": device_counts[dev_id] + }) + + return report, max_load + + @staticmethod + def exchange_expert(cur_exchange_index, next_exchange_index, cur_device_id, next_device_id, cur_layer_result, + com_between_devices): + + cur_device_deployment = cur_layer_result[cur_device_id]['assigned_experts'] + next_device_deployment = cur_layer_result[next_device_id]['assigned_experts'] + + cur_device_weight = cur_layer_result[cur_device_id]['expert_weights'] + next_device_weight = cur_layer_result[next_device_id]['expert_weights'] + + # 两张卡上对应的两个专家进行交换 + cur_expert_id = cur_device_deployment[cur_exchange_index] + next_expert_id = next_device_deployment[next_exchange_index] + cur_device_deployment[cur_exchange_index] = next_expert_id + next_device_deployment[next_exchange_index] = cur_expert_id + + cur_expert_weight = cur_device_weight[cur_exchange_index] + next_expert_weight = next_device_weight[next_exchange_index] + cur_device_weight[cur_exchange_index] = next_expert_weight + next_device_weight[next_exchange_index] = cur_expert_weight + + cur_layer_result[cur_device_id]['total_load'] += next_expert_weight - cur_expert_weight + cur_layer_result[next_device_id]['total_load'] += cur_expert_weight - next_expert_weight + + # 记录这两卡进行了通信 + com_between_devices[cur_device_id][next_device_id] = next_expert_id + com_between_devices[next_device_id][cur_device_id] = cur_expert_id + + @staticmethod + # 分层调整冗余专家 + def redundant_expert_deployment(self, layer_workloads, original_deployment, expert_form_device, node_num, + is_node_redundant): + device_num, per_device_expert_num = original_deployment.shape + route_expert_num = layer_workloads.shape[0] + redundancy_expert_num = per_device_expert_num * device_num - route_expert_num + per_node_device_num = device_num // node_num + per_node_route_expert_num = per_node_device_num * (per_device_expert_num - 1) + per_node_redun_expert_num = redundancy_expert_num // node_num + + 
weights = np.zeros((route_expert_num,), dtype='object') + for expert_id, workload_weight in enumerate(layer_workloads): + weights[expert_id] = (expert_id, int(workload_weight)) + + if is_node_redundant: + + device_assignments = [] + device_weights = [] + device_loads = [] + device_counts = [] + com_between_devices = [] + + for node_id in range(node_num): + cur_node_weights = weights[ + node_id * per_node_route_expert_num: (node_id + 1) * per_node_route_expert_num] + cur_original_deployment = original_deployment[ + node_id * per_node_device_num: (node_id + 1) * per_node_device_num] + + cur_device_assignments, cur_device_weights, cur_device_loads, cur_device_counts, cur_com_between_devices = self.redundancy_again( + self, + cur_node_weights, + per_node_redun_expert_num, + cur_original_deployment, + expert_form_device, + node_num, + is_node_redundant) + device_assignments += cur_device_assignments + device_weights += cur_device_weights + device_loads += cur_device_loads + device_counts += cur_device_counts + com_between_devices += cur_com_between_devices + + else: + device_assignments, device_weights, device_loads, device_counts, com_between_devices = self.redundancy_again( + self, + weights, + redundancy_expert_num, + original_deployment, + expert_form_device, + node_num, + is_node_redundant) + # 生成报告 + report, max_load = self.generate_allocation_report(device_assignments, device_weights, device_loads, + device_counts) + + return report, max_load, com_between_devices + + @staticmethod + def two_device_exchange_experts(cur_device_result, exchange_device_result, cur_exchanged_expert_id, + next_exchanged_expert_id, ave_workload, increment, num_redundancy_expert): + + cur_device_weight = cur_device_result['expert_weights'] + next_device_weight = exchange_device_result['expert_weights'] + + cur_device_expert_id = cur_device_result['assigned_experts'] + next_device_expert_id = exchange_device_result['assigned_experts'] + + cur_device_total_weight = int(cur_device_result['total_load']) + next_device_total_weight = int(exchange_device_result['total_load']) + max_weight = max(cur_device_total_weight, next_device_total_weight) + + cur_exchange_index = -1 + next_exchange_index = -1 + + redun = False + if num_redundancy_expert != 0: + redun = True + + for index, weight in enumerate(cur_device_weight): + for next_index, next_weight in enumerate(next_device_weight): + # 跳过冗余专家 + if (index == 0 or next_index == 0) and redun : + continue + # 交换专家限制卡内专家不同 + change_flag = True + if cur_device_expert_id[index] in next_device_expert_id or next_device_expert_id[next_index] in cur_device_expert_id: + change_flag = False + # 选择的专家不能是参与过交换的 + if (cur_device_expert_id[index] not in cur_exchanged_expert_id) and ( + next_device_expert_id[next_index] not in next_exchanged_expert_id) and change_flag: + cur_total_weight_after_exchange = cur_device_total_weight - weight + next_weight + next_total_weight_after_exchange = next_device_total_weight - next_weight + weight + exchange_max_weight = max(cur_total_weight_after_exchange, next_total_weight_after_exchange) + if exchange_max_weight < max_weight and (max_weight - exchange_max_weight) >= ( + ave_workload * increment): + max_weight = exchange_max_weight + cur_exchange_index = index + next_exchange_index = next_index + + return cur_exchange_index, next_exchange_index + + @staticmethod + def expert_exchange_between_devices(self, ave_workload, increment, cur_layer_result, com_between_devices, num_redundancy_expert, + node_idx=0, + per_node_device_num=0, 
is_node_redundant=False): + + if is_node_redundant: + # 拿出当前节点内设备的信息 + cur_devices_result = cur_layer_result[node_idx * per_node_device_num:(node_idx + 1) * per_node_device_num] + else: + # 拿取所有设备信息 + cur_devices_result = cur_layer_result + + devices_total_weight = [] + for device in cur_devices_result: + devices_total_weight.append((int(device['total_load']), device['device_id'] - 1)) + + # 当迭代次数超过100或负载最大的设备无法进行调整时退出 + exchange_frequency = 100 + while exchange_frequency > 0: + exchange_frequency -= 1 + + # 根据负载从小到大排序 + devices_total_weight.sort(key=lambda x: x[0]) + # 负载最大的设备id + max_weight_device_id = devices_total_weight[-1][1] + + exchange = False + # 按照负载从小到大依次取卡 + for index in range(0, len(devices_total_weight) - 1): + min_weight_device_id = devices_total_weight[index][1] + # 两个节点没有进行过通信 + if min_weight_device_id not in com_between_devices[max_weight_device_id]: + # 找到设备中交换过的专家id,(除了冗余之外通信过的id) + set_cur_com_expert_id = set(com_between_devices[max_weight_device_id].values()) + set_next_com_expert_id = set(com_between_devices[min_weight_device_id].values()) + if num_redundancy_expert != 0: + set_cur_device_expert_id = set(cur_layer_result[max_weight_device_id]['assigned_experts'][1:]) + set_next_device_expert_id = set(cur_layer_result[min_weight_device_id]['assigned_experts'][1:]) + else: + set_cur_device_expert_id = set(cur_layer_result[max_weight_device_id]['assigned_experts']) + set_next_device_expert_id = set(cur_layer_result[min_weight_device_id]['assigned_experts']) + + cur_exchanged_expert_id = set_cur_com_expert_id & set_cur_device_expert_id + next_exchanged_expert_id = set_next_com_expert_id & set_next_device_expert_id + + cur_exchange_index, next_exchange_index = self.two_device_exchange_experts( + cur_layer_result[max_weight_device_id], + cur_layer_result[min_weight_device_id], + cur_exchanged_expert_id, + next_exchanged_expert_id, + ave_workload, + increment, + num_redundancy_expert) + + # 有符合条件的专家进行交换 + if cur_exchange_index != -1: + self.exchange_expert(cur_exchange_index, + next_exchange_index, + max_weight_device_id, + min_weight_device_id, + cur_layer_result, + com_between_devices) + + devices_total_weight[-1] = ( + cur_layer_result[max_weight_device_id]['total_load'], max_weight_device_id) + devices_total_weight[index] = ( + cur_layer_result[min_weight_device_id]['total_load'], min_weight_device_id) + exchange = True + break + + if not exchange: + break + + @staticmethod + def exchange_experts(self, layer_result, layer_com_between_devices, num_nodes, device_num, is_node_redundant, + ave_workload, increment, num_redundancy_expert): + + global_deployment = [] + + if is_node_redundant: + per_node_device_num = device_num // num_nodes + for node_idx in range(num_nodes): + self.expert_exchange_between_devices(self, ave_workload, increment, layer_result, + layer_com_between_devices, num_redundancy_expert, + node_idx, per_node_device_num, is_node_redundant) + else: + self.expert_exchange_between_devices(self, ave_workload, increment, layer_result, layer_com_between_devices, num_redundancy_expert) + + max_workload = 0 + for box in layer_result: + global_deployment.append(box['assigned_experts']) + if max_workload < box['total_load']: + max_workload = box['total_load'] + + global_deployment = np.array(global_deployment) + + return global_deployment, max_workload + + @staticmethod + def count_elements(self, lst): + count = 0 + for item in lst: + if isinstance(item, list): + count += self.count_elements(self, item) + else: + count += 1 + return count + + def 
rebalance_experts(self, current_expert_table, expert_workload): + + info = DynamicTable() + info.workload_table = np.array(expert_workload) + info.placement_table = np.array(current_expert_table) + layer_num, num_npus, experts_per_npu = info.workload_table.shape + expert_ids, counts = np.unique(info.placement_table[0], return_counts=True) + num_redundancy_expert = self.get_redundant_num(num_npus, counts) + num_original_expert = len(expert_ids) + layer_workloads = self.add_redundant(info.placement_table, info.workload_table, num_original_expert) + max_heat_per_layer_before = self.calculate_max_heat_per_layer(info.workload_table, layer_num) + npu_heat_all_origin = sum(max_heat_per_layer_before) + + # 计算负载均衡,部署冗余专家 + num_node = num_npus / 8 + layer_num = layer_workloads.shape[0] + expert_num = layer_workloads.shape[1] + expert_from_device = np.zeros((layer_num, num_original_expert)) + # 校验专家数量、卡数量、冗余专家数量不能超过卡数量 + if num_original_expert != expert_num: + raise ValueError(f"原始专家数量 {num_original_expert} 必须等于 expert_num {expert_num}") + + if num_npus <= 0: + raise ValueError("NPUs 数量必须大于 0") + + if num_npus < num_redundancy_expert: + raise ValueError(f"NPUs 数量 {num_npus} 必须大于或等于冗余专家数量 {num_redundancy_expert}") + + # 每个卡部署的专家数量 一个冗余专家 + global_deployment = [[[] for _ in range(num_npus)] for _ in range(layer_num)] + # 统计更换数据集后的初始58层不均衡度 + layer_initial_imbalance = self.calculate_initial_imbalance(current_expert_table, layer_workloads) + # 遍历获得每一层的放置策略,考虑计算均衡 + max_heat_per_layer_after = np.zeros([layer_num]) + sum_num = 0 + for layer in range(layer_num): + # 不均衡度小于特定阈值不调整 + if layer_initial_imbalance[layer] < 1.1: + global_deployment[layer] = current_expert_table[layer] + continue + + ave_workload = np.sum(layer_workloads[layer]) / num_npus + for device_id, device in enumerate(current_expert_table[layer]): + for index, expert_id in enumerate(device): + if index != 0: + expert_from_device[layer][expert_id] = device_id + + # 调整冗余专家 + result, max_workload, com_between_devices = self.redundant_expert_deployment(self, layer_workloads[layer], + current_expert_table[layer], + expert_from_device[layer], + num_node, False) + # 交换专家 + global_deployment[layer], new_max_workload = self.exchange_experts(self, result, com_between_devices, + num_node, num_npus, False, ave_workload, + 0.05, num_redundancy_expert) + + for device_id in range(num_npus): + com_between_devices[device_id] = {int(key): int(value) for key, value in + com_between_devices[device_id].items()} + sum_num += self.count_elements(self, com_between_devices[device_id]) + + max_heat_per_layer_after[layer] = max(result, key=lambda x: x['total_load'])['total_load'] + + # 获取层优先级 + layer_changed_ratio = [] + for layer_idx in range(layer_num): + layer_changed_ratio.append(max_heat_per_layer_after[layer_idx] / max_heat_per_layer_before[layer_idx]) + + per_layer_priority = np.argsort(layer_changed_ratio) + npu_heat_all_after = sum(max_heat_per_layer_after) + + change = 0 + if npu_heat_all_after < 0.95 * npu_heat_all_origin: + change = 1 + + return change, per_layer_priority, np.array(global_deployment).tolist() + + + + From 771d4c70ccaf96963055288845f26b17c04b3b4e Mon Sep 17 00:00:00 2001 From: NPUDev Date: Sat, 21 Jun 2025 16:26:15 +0800 Subject: [PATCH 114/153] fix dummy_run and profile_run --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 24 +++++++++++++----------- vllm_ascend/worker/model_runner_v1.py | 13 +++++++++++-- vllm_ascend/worker/worker_v1.py | 2 +- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git 
a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 111cd4adc6..9f06a8af75 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -127,21 +127,23 @@ def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path): for layer_idx in range(num_moe_layers): self.expert_map_per_layer_cpu[layer_idx] = \ expert_map_tensor[layer_idx][self.rank_id] + return expert_map_tensor def _expert_file_to_tensor(self, expert_map_path: str): with open(expert_map_path, "r") as f: data = json.load(f) - layers_num = data["moe_layer_count"] - gpus_num = data["layer_list"][0]["device_count"] - - tensor_data = [] - for layer in data["layer_list"]: - device_data = [] - for device in layer["device_list"]: - device_data.append(device["device_expert"]) - tensor_data.append(device_data) - expert_map_tensor = torch.tensor(tensor_data, dtype=torch.int32) - return expert_map_tensor, layers_num, gpus_num + layers_num = data["moe_layer_count"] + gpus_num = data["layer_list"][0]["device_count"] + + tensor_data = [] + for layer in data["layer_list"]: + device_data = [] + for device in layer["device_list"]: + device_data.append(device["device_expert"]) + tensor_data.append(device_data) + expert_map_tensor = torch.tensor(tensor_data, dtype=torch.int32) + return expert_map_tensor, layers_num, gpus_num + logger.error(f"failed to read expert_map_path: {expert_map_path}") def do_update_expert_map(self, layer_id, updated_expert_map): self.expert_map_per_layer[layer_id].copy_(updated_expert_map) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 03e689b1fc..4d5295343d 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -370,6 +370,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.dynamic_eplb = ascend_config.dynamic_eplb if self.dynamic_eplb == True: self.eplb_adaptor = None + self.is_eplb_warmuped = False self.eplb_updator = EplbUpdator(ascend_config.expert_map_path) def _update_states(self, scheduler_output: "SchedulerOutput") -> None: @@ -1457,6 +1458,7 @@ def _dummy_run( num_tokens: int, is_compile: bool = False, with_prefill: bool = True, + is_profile_run: bool = False, ) -> torch.Tensor: # Set num_scheduled_tokens based on num_tokens and max_num_seqs # for dummy run with LoRA so that the num_reqs collectively @@ -1471,6 +1473,10 @@ def _dummy_run( assert len(num_scheduled_tokens_list) == num_reqs num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32) + + if not is_compile and not is_profile_run and self.dynamic_eplb: + self.eplb_updator.forward_before() + with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): model = self.model @@ -1537,6 +1543,8 @@ def _dummy_run( intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds) + if not is_compile and not is_profile_run and self.dynamic_eplb: + self.eplb_updator.forward_end() return hidden_states def profile_run(self) -> None: @@ -1565,7 +1573,7 @@ def profile_run(self) -> None: # TODO: call maybe_profile_with_lora() # Trigger compilation for general shape. 
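# NOTE (editorial, hedged): the change below threads is_profile_run=True into
# _dummy_run so that the memory-profiling dummy run is excluded from the
# dynamic-EPLB hooks, which this patch gates on
# `not is_compile and not is_profile_run and self.dynamic_eplb`.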
- hidden_states = self._dummy_run(self.max_num_tokens) + hidden_states = self._dummy_run(num_tokens = self.max_num_tokens, is_profile_run=True) if get_pp_group().is_last_rank: hidden_states = hidden_states[logit_indices] @@ -1583,7 +1591,8 @@ def do_get_expert_load(self) -> str: def eplb_warmup(self): #EPLB - if self.dynamic_eplb == True: + if self.dynamic_eplb and not self.is_eplb_warmuped: + self.is_eplb_warmuped = True self.eplb_adaptor = VllmEplbAdaptor(model=self.model) self.eplb_updator.set_adaptor(self.eplb_adaptor) self.eplb_updator.warm_up_eplb() diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 16e1bec219..1df2d7a265 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -192,6 +192,7 @@ def load_model(self) -> None: self.model_runner.load_model() def compile_or_warm_up_model(self) -> None: + self.model_runner.eplb_warmup() warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() if not self.model_config.enforce_eager: warmup_sizes = [ @@ -202,7 +203,6 @@ def compile_or_warm_up_model(self) -> None: logger.info("Compile and warming up model for size %d", size) self.model_runner._dummy_run(size) - self.model_runner.eplb_warmup() if not self.model_config.enforce_eager: self.model_runner.capture_model() # Reset the seed to ensure that the random state is not affected by From da27c2d91baaac6de1dc948c9d163484032d5ad9 Mon Sep 17 00:00:00 2001 From: lt Date: Sat, 21 Jun 2025 16:54:10 +0800 Subject: [PATCH 115/153] add mock experts_load data --- vllm_ascend/eplb/eplb_updator.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 3a237d3ac3..168d1b79fb 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -214,9 +214,20 @@ def unpack_update_batch(self, packed_update_info): return recovered def get_expert_load(self) -> str: - """todo 确认moe_load的值是什么类型""" - # return '{"a":"b"}' # mock - return self.shared_dict['moe_load'] + + # todo wjh 给到返回值 + # return self.shared_dict['moe_load'] + # mock json_str + experts_load = ('{\"expert_load\":[' + '{\"ip\":\"141.xxx.xxx.181\",' + '\"node_0\":' + '{\"card_0\":' + '[{\"layer_4\":{\"expert_0\":3,\"expert_2\":1}},{\"layer_5\":{\"expert_0\":3,\"expert_2\":1}}],' + '\"card_1\":[{\"layer_4\":{\"expert_1\":3,\"expert_3\":1},\"layer_5\":{\"expert_0\":3,\"' + 'expert_2\":1}}]}},{\"ip\":\"141.xxx.xxx.177\",\"node_0\":{\"card_0\":[{\"layer_4\":' + '{\"expert_0\":3,\"expert_2\":1}},{\"layer_5\":{\"expert_0\":3,\"expert_2\":1}}],' + '\"card_1\":[{\"layer_4\":{\"expert_1\":3,\"expert_3\":1}}]}}]}') + return experts_load def shutdown(self): """ From 70a922ed25a6a452e08b24ae71f103fd3536ceac Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Sat, 21 Jun 2025 18:07:09 +0800 Subject: [PATCH 116/153] fix bugs in get_init_expert_map_from_file --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 31 ++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 9f06a8af75..1c86cd8bee 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -122,12 +122,39 @@ def get_init_expert_map(self, num_moe_layers): return all_expert_maps + def local2global(self, + placement_local: torch.Tensor + ) -> torch.Tensor: + + L, G, E_local = placement_local.shape + device = placement_local.device + + max_id = 
torch.max(placement_local) + E_global = (max_id + 1).item() if max_id >= 0 else 0 + + if E_global == 0: + return torch.empty((L, G, 0), dtype=torch.long, device=device) + + placement_global = torch.full((L, G, E_global), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement_local >= 0 + l_idx, g_idx, slot_idx = valid.nonzero(as_tuple=True) + gid_idx = placement_local[l_idx, g_idx, slot_idx] + + placement_global[l_idx, g_idx, gid_idx] = slot_idx + + return placement_global + def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path): expert_map_tensor, layers_num, ranks_num = self._expert_file_to_tensor(expert_map_path) + expert_map_all = self.local2global(expert_map_tensor) for layer_idx in range(num_moe_layers): - self.expert_map_per_layer_cpu[layer_idx] = \ + self.expert_map_per_layer_cpu[self.num_dense_layers + layer_idx] = \ expert_map_tensor[layer_idx][self.rank_id] - return expert_map_tensor + return expert_map_all def _expert_file_to_tensor(self, expert_map_path: str): with open(expert_map_path, "r") as f: From af31373f0a25d7f63003ee03597376e5ea95242a Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Sat, 21 Jun 2025 19:29:43 +0800 Subject: [PATCH 117/153] fix bug in init expert_map_per_layer_cpu --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 1c86cd8bee..bcccd420f0 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -153,7 +153,7 @@ def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path): expert_map_all = self.local2global(expert_map_tensor) for layer_idx in range(num_moe_layers): self.expert_map_per_layer_cpu[self.num_dense_layers + layer_idx] = \ - expert_map_tensor[layer_idx][self.rank_id] + expert_map_all[layer_idx][self.rank_id] return expert_map_all def _expert_file_to_tensor(self, expert_map_path: str): From 613c03063867008c63b01aa0ec6621424164ff6b Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sat, 21 Jun 2025 20:26:42 +0800 Subject: [PATCH 118/153] add gate_eplb --- vllm_ascend/eplb/eplb_updator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 3a237d3ac3..62ff828e7e 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -42,6 +42,8 @@ def init_eplb(self, expert_map_path): self.expert_map_initialized = False self.update_in_flight = False + self.gate_eplb = True + self.reqs = [] self.update_info_all = [] @@ -78,7 +80,10 @@ def init_eplb(self, expert_map_path): def get_update_iteration(self): self.cur_iterations = self.cur_iterations + 1 - return self.cur_iterations % self.num_iterations == 0 + if not self.gate_eplb: + return self.cur_iterations % self.num_iterations == 0 + else: + return return self.cur_iterations == self.num_iterations def get_init_expert_map(self): try: From 057a2977d5154c2b5085b24f25e1b4917e1fc2ca Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sat, 21 Jun 2025 20:44:54 +0800 Subject: [PATCH 119/153] get_init_experts_map in warm up --- vllm_ascend/eplb/eplb_updator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 62ff828e7e..1d61135561 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -97,7 +97,6 @@ def wakeup_eplb_worker(self): 
self.planner_block_queue.put(1) def forward_before(self): - self.get_init_expert_map() # Batch after eplb process being triggered, get update info provided by eplb process if self.update_in_flight and self.weight_update_counter == 0 and self.wait_worker_iterations == self.num_wait_worker_iterations: @@ -164,6 +163,8 @@ def compute_and_set_moe_load(self): def warm_up_eplb(self): + self.get_init_expert_map() + self.compute_and_set_moe_load() src_tensor = torch.empty((1,), device=self.device) From 9f498e9178e9ea8227c8a8104f721c69e867002c Mon Sep 17 00:00:00 2001 From: lt Date: Sat, 21 Jun 2025 20:52:22 +0800 Subject: [PATCH 120/153] add update_expert_load_statistical_period logic --- vllm_ascend/eplb/eplb_updator.py | 9 ++++++++- vllm_ascend/worker/model_runner_v1.py | 3 +++ vllm_ascend/worker/worker_v1.py | 3 +++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 168d1b79fb..b7997cecff 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -33,7 +33,7 @@ def set_adaptor(self, adaptor): self.num_moe_layers = self.adaptor.num_moe_layers def init_eplb(self, expert_map_path): - + self.num_expert_load_gather = 0 self.redundant_enable = (expert_map_path != None) self.num_iterations: torch.int64 = 130 self.expert_map_path = expert_map_path @@ -229,6 +229,13 @@ def get_expert_load(self) -> str: '\"card_1\":[{\"layer_4\":{\"expert_1\":3,\"expert_3\":1}}]}}]}') return experts_load + def do_update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): + logger.info(f" start update {self.num_expert_load_gather=}, {self.num_iterations}...") + self.num_expert_load_gather = num_expert_load_gather + self.num_iterations = num_iterations + logger.info(f" update {self.num_expert_load_gather=}, {self.num_iterations} success...") + + def shutdown(self): """ Clean up the EPLB process. 
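# ----------------------------------------------------------------------
# Editorial usage sketch (hedged, not part of the original patch): the
# statistical period added above is expected to be driven top-down through
# the plumbing introduced in this patch; the call chain below is
# illustrative only and follows the names used here, with example values.
#
#   worker.update_expert_load_statistical_period(num_expert_load_gather=10,
#                                                num_iterations=130)
#     -> model_runner.do_update_expert_load_statistical_period(...)
#       -> eplb_updator stores num_expert_load_gather / num_iterations,
#          which control how often expert load is gathered versus how
#          often a full rebalance is triggered.
# ----------------------------------------------------------------------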
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 4d5295343d..37bb5265e1 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1589,6 +1589,9 @@ def profile_run(self) -> None: def do_get_expert_load(self) -> str: return self.eplb_updator.get_expert_load() + def do_update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): + return self.eplb_updator.update_expert_load_statistical_period(num_expert_load_gather, num_iterations) + def eplb_warmup(self): #EPLB if self.dynamic_eplb and not self.is_eplb_warmuped: diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 1df2d7a265..a5a6dcf93b 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -214,6 +214,9 @@ def get_expert_load(self) -> str: moe_load = self.model_runner.do_get_expert_load() return moe_load + def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): + self.model_runner.do_update_expert_load_statistical_period(num_expert_load_gather, num_iterations) + def get_model(self) -> nn.Module: return self.model_runner.get_model() From 11936d58a44f9a3ca66c537aca84e6225d9da03a Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sat, 21 Jun 2025 21:06:50 +0800 Subject: [PATCH 121/153] add generate expert map --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 97 +++++++++++++++++++----- 1 file changed, 79 insertions(+), 18 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 512a43821f..a96e498197 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -14,14 +14,16 @@ # limitations under the License. # This file is a part of the vllm-ascend project. 
# - +import os +import json import torch +import random import torch.distributed as dist import numpy as np from vllm_ascend.eplb.adaptor.abstract_adaptor import EplbAdaptor from vllm.logger import logger -import random + class VllmEplbAdaptor(EplbAdaptor): @@ -29,12 +31,14 @@ class VllmEplbAdaptor(EplbAdaptor): def __init__(self, model, **args): super().__init__(**args) self.model = model - self.rank_id = torch.distributed.get_rank() + self.rank_id = dist.get_rank() + self.world_size = dist.get_world_size() self.param_dict = dict(self.model.named_parameters()) self.num_dense_layers = self.model.config.first_k_dense_replace self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers self.global_expert_num = self.model.config.n_routed_experts + # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 is supported here self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", "w2_weight_scale", "w2_weight_offset"] @@ -122,25 +126,31 @@ def get_init_expert_map(self, num_moe_layers): return all_expert_maps def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path): - expert_map_tensor, layers_num, ranks_num = self._expert_file_to_tensor(expert_map_path) + if os.path.exists(expert_map_path): + expert_map_tensor, layers_num, ranks_num = self._expert_file_to_tensor(expert_map_path) + expert_map_all = self.local2global(expert_map_tensor) + else: + expert_map_all = self.determine_expert_map_all() for layer_idx in range(num_moe_layers): - self.expert_map_per_layer_cpu[layer_idx] = \ - expert_map_tensor[layer_idx][self.rank_id] - + self.expert_map_per_layer_cpu[layer_idx+3] = \ + expert_map_all[layer_idx][self.rank_id] + return expert_map_all + def _expert_file_to_tensor(self, expert_map_path: str): with open(expert_map_path, "r") as f: data = json.load(f) - layers_num = data["moe_layer_count"] - gpus_num = data["layer_list"][0]["device_count"] - - tensor_data = [] - for layer in data["layer_list"]: - device_data = [] - for device in layer["device_list"]: - device_data.append(device["device_expert"]) - tensor_data.append(device_data) - expert_map_tensor = torch.tensor(tensor_data, dtype=torch.int32) - return expert_map_tensor, layers_num, gpus_num + layers_num = data["moe_layer_count"] + gpus_num = data["layer_list"][0]["device_count"] + + tensor_data = [] + for layer in data["layer_list"]: + device_data = [] + for device in layer["device_list"]: + device_data.append(device["device_expert"]) + tensor_data.append(device_data) + expert_map_tensor = torch.tensor(tensor_data, dtype=torch.int32) + return expert_map_tensor, layers_num, gpus_num + logger.error(f"failed to read expert_map_path: {expert_map_path}") def do_update_expert_map(self, layer_id, updated_expert_map): self.expert_map_per_layer[layer_id].copy_(updated_expert_map) @@ -156,3 +166,54 @@ def do_update_expert_weight(self, layer_id, local_expert_to_replace, buffer_tens def do_update_log2phy_map(self, layer_id, updated_log2phy_map): if self.log2phy_map_per_layer[layer_id] is not None: self.log2phy_map_per_layer[layer_id].copy_(updated_log2phy_map[self.rank_id]) + + def local2global(self, + placement_local: torch.Tensor + ) -> torch.Tensor: + + L, G, E_local = placement_local.shape + device = placement_local.device + + max_id = torch.max(placement_local) + E_global = (max_id + 1).item() if max_id >= 0 else 0 + + if E_global == 0: + return torch.empty((L, G, 0), dtype=torch.long, device=device) + + placement_global = torch.full((L, 
G, E_global), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement_local >= 0 + l_idx, g_idx, slot_idx = valid.nonzero(as_tuple=True) + gid_idx = placement_local[l_idx, g_idx, slot_idx] + + placement_global[l_idx, g_idx, gid_idx] = slot_idx + + return placement_global + + def determine_expert_map_all(self): + + local_num_experts = self.global_expert_num // self.world_size + + expert_map_all = torch.full( + (self.num_moe_layers, self.world_size, self.global_expert_num), + -1, + dtype=torch.int32 + ) + + for r in range(self.world_size): + if r < self.world_size - 1: + start = r * local_num_experts + end = (r + 1) * local_num_experts + local_count = local_num_experts + else: + start = r * local_num_experts + end = self.global_expert_num + local_count = self.global_expert_num - r * local_num_experts + + local_ids = torch.arange(local_count, dtype=torch.int32) + expert_map_all[:, r, start:end] = local_ids.unsqueeze(0).expand(self.num_moe_layers, -1) + + return expert_map_all \ No newline at end of file From d0e8104b7aa18412b4ea8ce702c5f1fc800abbee Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sat, 21 Jun 2025 21:14:54 +0800 Subject: [PATCH 122/153] add generate_expert_map_all --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 28 ++++++++---------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 16a64158a6..3128cd4db7 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -20,7 +20,6 @@ import random import torch.distributed as dist import numpy as np -import json from vllm_ascend.eplb.adaptor.abstract_adaptor import EplbAdaptor from vllm.logger import logger @@ -106,25 +105,16 @@ def get_rank_expert_workload( moe_load.scatter_add_(dim=1, index=ids2d, src=ones2d) return moe_load - def get_init_expert_map(self, num_moe_layers): - expert_map = self.model.get_all_expert_map(num_moe_layers) - if dist.is_initialized(): - world_size = dist.get_world_size() - rank = dist.get_rank() - - gathered = torch.empty((world_size, *expert_map.shape), # [W, L, E] - dtype=expert_map.dtype, - device=expert_map.device) - - dist.all_gather_into_tensor(gathered, expert_map) - all_maps = gathered.permute(1, 0, 2) - all_expert_maps = all_maps.cpu() - + def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path): + if os.path.exists(expert_map_path): + expert_map_tensor, layers_num, ranks_num = self._expert_file_to_tensor(expert_map_path) + expert_map_all = self.local2global(expert_map_tensor) + else: + expert_map_all = self.determine_expert_map_all() for layer_idx in range(num_moe_layers): - self.expert_map_per_layer_cpu[self.num_dense_layers + layer_idx] = \ - all_expert_maps[layer_idx][self.rank_id] - - return all_expert_maps + self.expert_map_per_layer_cpu[layer_idx+3] = \ + expert_map_all[layer_idx][self.rank_id] + return expert_map_all def local2global(self, placement_local: torch.Tensor From 0c8318cd2f5eb587975857ee6df11e0730830dfd Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sat, 21 Jun 2025 21:16:32 +0800 Subject: [PATCH 123/153] generate expert map --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 27 ++++++++++++++++-------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 3128cd4db7..fc17ed3474 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -105,16 +105,25 @@ def 
get_rank_expert_workload( moe_load.scatter_add_(dim=1, index=ids2d, src=ones2d) return moe_load - def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path): - if os.path.exists(expert_map_path): - expert_map_tensor, layers_num, ranks_num = self._expert_file_to_tensor(expert_map_path) - expert_map_all = self.local2global(expert_map_tensor) - else: - expert_map_all = self.determine_expert_map_all() + def get_init_expert_map(self, num_moe_layers): + expert_map = self.model.get_all_expert_map(num_moe_layers) + if dist.is_initialized(): + world_size = dist.get_world_size() + rank = dist.get_rank() + + gathered = torch.empty((world_size, *expert_map.shape), # [W, L, E] + dtype=expert_map.dtype, + device=expert_map.device) + + dist.all_gather_into_tensor(gathered, expert_map) + all_maps = gathered.permute(1, 0, 2) + all_expert_maps = all_maps.cpu() + for layer_idx in range(num_moe_layers): - self.expert_map_per_layer_cpu[layer_idx+3] = \ - expert_map_all[layer_idx][self.rank_id] - return expert_map_all + self.expert_map_per_layer_cpu[self.num_dense_layers + layer_idx] = \ + all_expert_maps[layer_idx][self.rank_id] + + return all_expert_maps def local2global(self, placement_local: torch.Tensor From ab4bfd2c9af76ba8a79382b267140723c7b958cf Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sat, 21 Jun 2025 21:19:15 +0800 Subject: [PATCH 124/153] init expert map --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index fc17ed3474..a0b26e8e2a 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -152,10 +152,13 @@ def local2global(self, return placement_global def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path): - expert_map_tensor, layers_num, ranks_num = self._expert_file_to_tensor(expert_map_path) - expert_map_all = self.local2global(expert_map_tensor) + if os.path.exists(expert_map_path): + expert_map_tensor, layers_num, ranks_num = self._expert_file_to_tensor(expert_map_path) + expert_map_all = self.local2global(expert_map_tensor) + else: + expert_map_all = self.determine_expert_map_all() for layer_idx in range(num_moe_layers): - self.expert_map_per_layer_cpu[self.num_dense_layers + layer_idx] = \ + self.expert_map_per_layer_cpu[layer_idx+3] = \ expert_map_all[layer_idx][self.rank_id] return expert_map_all From e6e25f33cd564ad18c1b0a8c341f9d0c0c1b2dff Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sat, 21 Jun 2025 21:22:47 +0800 Subject: [PATCH 125/153] fix bugs in get_update_iteration --- vllm_ascend/eplb/eplb_updator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 1d61135561..f8f12d54ed 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -83,7 +83,7 @@ def get_update_iteration(self): if not self.gate_eplb: return self.cur_iterations % self.num_iterations == 0 else: - return return self.cur_iterations == self.num_iterations + return self.cur_iterations == self.num_iterations def get_init_expert_map(self): try: From 353150e67b2bdf8a40cbd8fdbb09d8dae9cacebe Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sat, 21 Jun 2025 22:31:51 +0800 Subject: [PATCH 126/153] fix bug in get_init_expert_map_from_file --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index a0b26e8e2a..6d6cea7472 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -152,11 +152,13 @@ def local2global(self, return placement_global def get_init_expert_map_from_file(self, num_moe_layers, expert_map_path): - if os.path.exists(expert_map_path): + + try: expert_map_tensor, layers_num, ranks_num = self._expert_file_to_tensor(expert_map_path) expert_map_all = self.local2global(expert_map_tensor) - else: + except (TypeError, FileNotFoundError, OSError): expert_map_all = self.determine_expert_map_all() + for layer_idx in range(num_moe_layers): self.expert_map_per_layer_cpu[layer_idx+3] = \ expert_map_all[layer_idx][self.rank_id] From f6830d40535438b11bf2bfce2d40ef681c72ec1f Mon Sep 17 00:00:00 2001 From: lt Date: Sun, 22 Jun 2025 12:06:48 +0800 Subject: [PATCH 127/153] update policy = 6 --- vllm_ascend/eplb/eplb_updator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index f8f12d54ed..7d0f963776 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -70,7 +70,7 @@ def init_eplb(self, expert_map_path): planner_q = self.planner_block_queue, block_update_q = self.block_update_queue, redundant_enable = self.redundant_enable, - policy_type = 2, + policy_type = 6, enable_d2d = True ) From 041e141a5bced20bf97458e69c1d5fd1d6a30428 Mon Sep 17 00:00:00 2001 From: raindaywhu Date: Sun, 22 Jun 2025 12:16:47 +0800 Subject: [PATCH 128/153] add load_gather_iteration --- vllm_ascend/eplb/eplb_updator.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index b4d872a5ab..8bb0c3d837 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -16,6 +16,7 @@ # import torch import torch.distributed as dist +import vllm.envs as envs from multiprocessing import Queue, Manager from vllm.logger import logger @@ -33,11 +34,17 @@ def set_adaptor(self, adaptor): self.num_moe_layers = self.adaptor.num_moe_layers def init_eplb(self, expert_map_path): - self.num_expert_load_gather = 0 + self.num_expert_load_gather = 10 self.redundant_enable = (expert_map_path != None) self.num_iterations: torch.int64 = 130 self.expert_map_path = expert_map_path + try: + if not envs.VLLM_ALLOW_EXPERT_LOAD_COLLECTING: + self.num_expert_load_gather = self.num_iterations + except Exception as e: + self.num_expert_load_gather = self.num_iterations + self.weight_update_counter = 0 self.expert_map_initialized = False self.update_in_flight = False @@ -80,10 +87,9 @@ def init_eplb(self, expert_map_path): def get_update_iteration(self): self.cur_iterations = self.cur_iterations + 1 - if not self.gate_eplb: - return self.cur_iterations % self.num_iterations == 0 - else: - return self.cur_iterations == self.num_iterations + load_gather_iteration = self.cur_iterations % self.num_expert_load_gather == 0 if not self.gate_eplb else self.cur_iterations == self.num_iterations + upate_iteration = self.cur_iterations % self.num_iterations == 0 if not self.gate_eplb else self.cur_iterations == self.num_iterations + return load_gather_iteration, upate_iteration def get_init_expert_map(self): try: @@ -125,12 +131,15 @@ def forward_before(self): self.eplb_loader.asyn_expert_weight_transfer(self.reqs) def forward_end(self): - if not self.update_in_flight 
and self.get_update_iteration(): - moe_load = self.compute_and_set_moe_load() - self.wakeup_eplb_worker() - self.update_in_flight = True - self.wait_worker_iterations = 0 - self.weight_loading = False + if not self.update_in_flight: + load_gather_iteration, update_iteration = self.get_update_iteration() + if load_gather_iteration: + self.moe_load = self.compute_and_set_moe_load() + if update_iteration: + self.wakeup_eplb_worker() + self.update_in_flight = True + self.wait_worker_iterations = 0 + self.weight_loading = False if self.update_in_flight: self.wait_worker_iterations = self.wait_worker_iterations + 1 From f4f9fd766f9b2aa8967b22b31b47c1e3a38ecfe4 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Sun, 22 Jun 2025 12:48:40 +0800 Subject: [PATCH 129/153] add code to guarantee there is no expert movement inside a NPU --- vllm_ascend/eplb/core/policy/dynamic_ep_v2.py | 74 +++++-------------- .../eplb/core/policy/policy_factory.py | 11 ++- 2 files changed, 27 insertions(+), 58 deletions(-) diff --git a/vllm_ascend/eplb/core/policy/dynamic_ep_v2.py b/vllm_ascend/eplb/core/policy/dynamic_ep_v2.py index 49eeb4faeb..783e04cbf0 100644 --- a/vllm_ascend/eplb/core/policy/dynamic_ep_v2.py +++ b/vllm_ascend/eplb/core/policy/dynamic_ep_v2.py @@ -307,61 +307,6 @@ def calculate_initial_imbalance(global_deployment, new_layer_workloads): return layer_imbalance - def rebalance_experts(self, current_expert_table, expert_workload): - - info = DynamicTable() - info.workload_table = np.array(expert_workload) - info.placement_table = np.array(current_expert_table) - layer_num, num_npus, experts_per_npu = info.workload_table.shape - expert_ids, counts = np.unique(info.placement_table[0], return_counts=True) - num_redundancy_expert = self.get_redundant_num(num_npus, counts) - num_original_expert = len(expert_ids) - layer_workloads = self.add_redundant(info.placement_table, info.workload_table, num_original_expert) - max_heat_per_layer_before = self.calculate_max_heat_per_layer(info.workload_table, layer_num) - npu_heat_all_origin = sum(max_heat_per_layer_before) - - # 计算负载均衡,部署冗余专家 - layer_num = layer_workloads.shape[0] - expert_num = layer_workloads.shape[1] - # 校验专家数量、卡数量、冗余专家数量不能超过卡数量 - if num_original_expert != expert_num: - raise ValueError(f"原始专家数量 {num_original_expert} 必须等于 expert_num {expert_num}") - - if num_npus <= 0: - raise ValueError("NPUs 数量必须大于 0") - - if num_npus < num_redundancy_expert: - raise ValueError(f"NPUs 数量 {num_npus} 必须大于或等于冗余专家数量 {num_redundancy_expert}") - - # 每个卡部署的专家数量 一个冗余专家 - global_deployment = [[[] for _ in range(num_npus)] for _ in range(layer_num)] - # 遍历获得每一层的放置策略,考虑计算均衡 - max_heat_per_layer_after = np.zeros([layer_num]) - for layer in range(layer_num): - # 获取当前层专家ID和对应负载,负载需要进行正则化处理, 每个卡加一个冗余专家 - weights = np.zeros((expert_num,), dtype='object') - for expert_id, workload_weight in enumerate(layer_workloads[layer]): - weights[expert_id] = (expert_id, workload_weight) - - # 获取每一层全局计算均衡的放置策略 - result, layer_deployment = self.original_compute_balanced_pack_redundancy( - weights, num_npus, num_redundancy_expert - ) - global_deployment[layer] = layer_deployment - max_heat_per_layer_after[layer] = max(result, key=lambda x: x['total_weight'])['total_weight'] - - # 获取层优先级 - layer_changed_ratio = [] - for layer_idx in range(layer_num): - layer_changed_ratio.append(max_heat_per_layer_after[layer_idx] / max_heat_per_layer_before[layer_idx]) - - per_layer_priority = np.argsort(layer_changed_ratio) - npu_heat_all_after = sum(max_heat_per_layer_after) - - change = 0 - - return 
change, per_layer_priority, np.array(global_deployment).tolist() - @staticmethod def compute_redundant_assignments(base_experts, num_redundant_experts, num_experts): """ @@ -845,6 +790,25 @@ def rebalance_experts(self, current_expert_table, expert_workload): num_node, num_npus, False, ave_workload, 0.05, num_redundancy_expert) + # To guarantee there is no expert movement inside a NPU + start_physical_idx = 1 if num_redundancy_expert else 0 + for rank in range(num_npus): + physical_expert = start_physical_idx + while physical_expert in range(start_physical_idx, experts_per_npu): + # skip the expert which is moved into this rank + if global_deployment[layer][rank][physical_expert] not in current_expert_table[layer, rank, :]: + physical_expert += 1 + continue + + if global_deployment[layer][rank][physical_expert] != current_expert_table[layer][rank][physical_expert]: + right_idx = np.where(current_expert_table[layer][rank] == global_deployment[layer][rank][physical_expert])[0][0] + # exchange expert with the expert on the right physical index + tempt = global_deployment[layer][rank][right_idx] + global_deployment[layer][rank][right_idx] = global_deployment[layer][rank][physical_expert] + global_deployment[layer][rank][physical_expert] = tempt + else: + physical_expert += 1 + for device_id in range(num_npus): com_between_devices[device_id] = {int(key): int(value) for key, value in com_between_devices[device_id].items()} diff --git a/vllm_ascend/eplb/core/policy/policy_factory.py b/vllm_ascend/eplb/core/policy/policy_factory.py index be0ff0323c..daa5e2ed55 100644 --- a/vllm_ascend/eplb/core/policy/policy_factory.py +++ b/vllm_ascend/eplb/core/policy/policy_factory.py @@ -10,14 +10,19 @@ class PolicyFactory: @staticmethod def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy: policy = { + """ + Constraint applying Dynamic EPLB policy V2: + If there exists redundant expert, + only one redundant expert can be placed in one NPU and its physical expert index must be 0 + """ # Applying bipartite d2d expert weight update composing 0:MockLoadBalance, # MockLoadBalance 1:DynamicEplb, # Dynamic EPLB policy 2:DynamicEplbV2, # Dynamic EPLB policy V2 # Applying greedy d2d expert weight update composing - 4:MockLoadBalance, # MockLoadBalance - 5:DynamicEplb, # Dynamic EPLB policy - 6:DynamicEplbV2, # Dynamic EPLB policy + 3:MockLoadBalance, # MockLoadBalance + 4:DynamicEplb, # Dynamic EPLB policy + 5:DynamicEplbV2, # Dynamic EPLB policy V2 } return policy.get(policy_type, MockLoadBalance)(config) From 5c09eabbfe2ae5a4e61a334110fbe61f239d5764 Mon Sep 17 00:00:00 2001 From: lt Date: Sun, 22 Jun 2025 14:18:04 +0800 Subject: [PATCH 130/153] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E6=97=A5=E5=BF=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_ascend/eplb/eplb_updator.py | 2 +- vllm_ascend/worker/worker_v1.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index b4d872a5ab..72ee4a4d3f 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -70,7 +70,7 @@ def init_eplb(self, expert_map_path): planner_q = self.planner_block_queue, block_update_q = self.block_update_queue, redundant_enable = self.redundant_enable, - policy_type = 2, + policy_type = 6, enable_d2d = True ) diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index a5a6dcf93b..2471770ecc 100644 --- a/vllm_ascend/worker/worker_v1.py +++ 
b/vllm_ascend/worker/worker_v1.py @@ -215,6 +215,7 @@ def get_expert_load(self) -> str: return moe_load def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): + logger.info("lt - ascend begin update update_expert_load_statistical_period") self.model_runner.do_update_expert_load_statistical_period(num_expert_load_gather, num_iterations) def get_model(self) -> nn.Module: From 017e0aa60edb2095ad97344bc479c1754b16d2d8 Mon Sep 17 00:00:00 2001 From: wanghanqingLYT Date: Sun, 22 Jun 2025 14:20:51 +0800 Subject: [PATCH 131/153] Update policy_factory.py fix invalid syntax in Update policy_factory.py --- vllm_ascend/eplb/core/policy/policy_factory.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm_ascend/eplb/core/policy/policy_factory.py b/vllm_ascend/eplb/core/policy/policy_factory.py index daa5e2ed55..7ebd048ff9 100644 --- a/vllm_ascend/eplb/core/policy/policy_factory.py +++ b/vllm_ascend/eplb/core/policy/policy_factory.py @@ -10,11 +10,10 @@ class PolicyFactory: @staticmethod def generate_policy(policy_type: int, config: DynamicConfig) -> EplbPolicy: policy = { - """ - Constraint applying Dynamic EPLB policy V2: - If there exists redundant expert, - only one redundant expert can be placed in one NPU and its physical expert index must be 0 - """ + # Constraint applying Dynamic EPLB policy V2: + # If there exists redundant expert: + # only one redundant expert can be placed in one NPU and its physical expert index must be 0 + # Applying bipartite d2d expert weight update composing 0:MockLoadBalance, # MockLoadBalance 1:DynamicEplb, # Dynamic EPLB policy From d537fb212ddd3546c6d1444322ab2fc85e255190 Mon Sep 17 00:00:00 2001 From: lt Date: Sun, 22 Jun 2025 16:46:08 +0800 Subject: [PATCH 132/153] update --- vllm_ascend/eplb/eplb_updator.py | 2 +- vllm_ascend/worker/worker_v1.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 2e801e5451..a04492669f 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -244,7 +244,7 @@ def get_expert_load(self) -> str: '\"card_1\":[{\"layer_4\":{\"expert_1\":3,\"expert_3\":1}}]}}]}') return experts_load - def do_update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): + def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): logger.info(f" start update {self.num_expert_load_gather=}, {self.num_iterations}...") self.num_expert_load_gather = num_expert_load_gather self.num_iterations = num_iterations diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 2471770ecc..a5a6dcf93b 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -215,7 +215,6 @@ def get_expert_load(self) -> str: return moe_load def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): - logger.info("lt - ascend begin update update_expert_load_statistical_period") self.model_runner.do_update_expert_load_statistical_period(num_expert_load_gather, num_iterations) def get_model(self) -> nn.Module: From 9e2cca1571addd53813c3f7df1bdce814a6478f8 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sun, 22 Jun 2025 17:11:11 +0800 Subject: [PATCH 133/153] dummy run not add moe load --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 35 ++++++++++++++++-------- vllm_ascend/eplb/eplb_updator.py | 21 ++++++-------- 
vllm_ascend/worker/model_runner_v1.py | 3 +- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 6d6cea7472..9d4db1fb80 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -85,25 +85,36 @@ def init_expert_param_per_layer(self): def get_rank_expert_workload( self, num_moe_layers: int, + dummy_run = False ) -> torch.Tensor: - # 收集各层 topk_ids -> list of [B, K] + all_topk_ids = [self.model.get_topk_ids(i) for i in range(num_moe_layers)] - # stack & flatten -> ids2d: [L, B*K] - stacked = torch.stack(all_topk_ids, dim=0) # [L, B, K] + stacked = torch.stack(all_topk_ids, dim=0) L, B, K = stacked.shape - ids2d = stacked.view(L, B * K).to(torch.int64) # [L, N] + N = B * K + device = stacked.device + G = self.global_expert_num + + if not hasattr(self, "cum_moe_load") or self.cum_moe_load is None: + self.cum_moe_load = torch.zeros((L, G), + dtype=torch.int64, + device=device) + + if dummy_run: + return self.cum_moe_load + + ids1d = stacked.view(-1).to(torch.int64) - device = ids2d.device - moe_load = torch.zeros((L, self.global_expert_num), - dtype=torch.int64, device=device) + row_idx = torch.arange(L, device=device).repeat_interleave(N) - ones2d = torch.ones_like(ids2d, dtype=torch.int64) + combined = row_idx * G + ids1d - assert moe_load.dim() == 2 and ids2d.dim() == 2 and ones2d.dim() == 2 - assert ids2d.shape == ones2d.shape + counts = torch.bincount(combined, minlength=L * G) + workload = counts.view(L, G) - moe_load.scatter_add_(dim=1, index=ids2d, src=ones2d) - return moe_load + self.cum_moe_load.add_(workload) + + return self.cum_moe_load def get_init_expert_map(self, num_moe_layers): expert_map = self.model.get_all_expert_map(num_moe_layers) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index a04492669f..2722a2da98 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -130,16 +130,14 @@ def forward_before(self): self.reqs = [] self.eplb_loader.asyn_expert_weight_transfer(self.reqs) - def forward_end(self): - if not self.update_in_flight: - load_gather_iteration, update_iteration = self.get_update_iteration() - if load_gather_iteration: - self.moe_load = self.compute_and_set_moe_load() - if update_iteration: - self.wakeup_eplb_worker() - self.update_in_flight = True - self.wait_worker_iterations = 0 - self.weight_loading = False + def forward_end(self, dummy_run=False): + self.adaptor.get_rank_expert_workload(self.num_moe_layers) + if not self.update_in_flight and self.get_update_iteration(): + moe_load = self.compute_and_set_moe_load(dummy_run) + self.wakeup_eplb_worker() + self.update_in_flight = True + self.wait_worker_iterations = 0 + self.weight_loading = False if self.update_in_flight: self.wait_worker_iterations = self.wait_worker_iterations + 1 @@ -148,7 +146,6 @@ def forward_end(self): def compute_and_set_moe_load(self): local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers) - self._gather_buffer = None if dist.is_initialized(): self.world_size = dist.get_world_size() @@ -161,7 +158,7 @@ def compute_and_set_moe_load(self): dist.all_gather_into_tensor(self._gather_buffer, local_load) - moe_load = self._gather_buffer.permute(1, 0, 2).contiguous() + moe_load = self._gather_buffer.permute(1, 0, 2) self.shared_dict["moe_load"] = moe_load.cpu() logger.debug(f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}") else: diff --git 
a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 37bb5265e1..70fdea6f02 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1544,7 +1544,8 @@ def _dummy_run( inputs_embeds=inputs_embeds) if not is_compile and not is_profile_run and self.dynamic_eplb: - self.eplb_updator.forward_end() + dummy_run = True + self.eplb_updator.forward_end(dummy_run) return hidden_states def profile_run(self) -> None: From edb38e427184701c1f561d5f5d9c6b62e450fa42 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sun, 22 Jun 2025 17:27:47 +0800 Subject: [PATCH 134/153] fix bug in compute moe load --- vllm_ascend/eplb/eplb_updator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 2722a2da98..e73d5af3ac 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -131,7 +131,7 @@ def forward_before(self): self.eplb_loader.asyn_expert_weight_transfer(self.reqs) def forward_end(self, dummy_run=False): - self.adaptor.get_rank_expert_workload(self.num_moe_layers) + self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run) if not self.update_in_flight and self.get_update_iteration(): moe_load = self.compute_and_set_moe_load(dummy_run) self.wakeup_eplb_worker() @@ -144,8 +144,8 @@ def forward_end(self, dummy_run=False): self.eplb_loader.update_expert_map_and_weight(self.reqs, self.redundant_enable) - def compute_and_set_moe_load(self): - local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers) + def compute_and_set_moe_load(self,dummy_run=False): + local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run) self._gather_buffer = None if dist.is_initialized(): self.world_size = dist.get_world_size() From 8b31e790eda708fbce0d492006b4dfcf7a05556f Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Sun, 22 Jun 2025 18:11:53 +0800 Subject: [PATCH 135/153] fix bugs in forward_end --- vllm_ascend/eplb/eplb_updator.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index e73d5af3ac..147d6182bc 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -130,14 +130,17 @@ def forward_before(self): self.reqs = [] self.eplb_loader.asyn_expert_weight_transfer(self.reqs) - def forward_end(self, dummy_run=False): +def forward_end(self,dummy_run=False): self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run) - if not self.update_in_flight and self.get_update_iteration(): - moe_load = self.compute_and_set_moe_load(dummy_run) - self.wakeup_eplb_worker() - self.update_in_flight = True - self.wait_worker_iterations = 0 - self.weight_loading = False + if not self.update_in_flight: + load_gather_iteration, update_iteration = self.get_update_iteration() + if load_gather_iteration: + moe_load = self.compute_and_set_moe_load(dummy_run) + if update_iteration: + self.wakeup_eplb_worker() + self.update_in_flight = True + self.wait_worker_iterations = 0 + self.weight_loading = False if self.update_in_flight: self.wait_worker_iterations = self.wait_worker_iterations + 1 From 53e89496acac5c7e7ea4825b6029ef4977dfbcc4 Mon Sep 17 00:00:00 2001 From: lt Date: Mon, 23 Jun 2025 10:21:15 +0800 Subject: [PATCH 136/153] fix some bug --- vllm_ascend/eplb/eplb_updator.py | 4 +--- vllm_ascend/worker/worker_v1.py | 6 ------ 2 files changed, 1 insertion(+), 9 deletions(-) diff 
--git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index b881552738..6e5fd7d762 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -1,4 +1,3 @@ -# # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -76,7 +75,7 @@ def init_eplb(self, expert_map_path): shared_dict = self.shared_dict, planner_q = self.planner_block_queue, block_update_q = self.block_update_queue, - redundant_enable = self.redundant_enable, + redundant_enable = self.redundant_enable, policy_type = 6, enable_d2d = True ) @@ -104,7 +103,6 @@ def wakeup_eplb_worker(self): self.planner_block_queue.put(1) def forward_before(self): - # Batch after eplb process being triggered, get update info provided by eplb process if self.update_in_flight and self.weight_update_counter == 0 and self.wait_worker_iterations == self.num_wait_worker_iterations: self.wait_worker_iterations = 0 diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 5aa322a148..197c2845df 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -178,11 +178,6 @@ def execute_model( output = self.model_runner.execute_model(scheduler_output) return output if self.is_driver_worker else None - def get_expert_load(self) -> str: - """ todo 一共几个worker""" - moe_load = self.model_runner.do_get_expert_load() - return moe_load - def load_model(self) -> None: if self.vllm_config.model_config.enable_sleep_mode: allocator = CaMemAllocator.get_instance() @@ -215,7 +210,6 @@ def compile_or_warm_up_model(self) -> None: set_random_seed(self.model_config.seed) def get_expert_load(self) -> str: - """ todo 一共几个worker""" moe_load = self.model_runner.do_get_expert_load() return moe_load From 98b93834bf985786d9cdbcf74d1a1b78b9acf141 Mon Sep 17 00:00:00 2001 From: wanghanqingLYT Date: Mon, 23 Jun 2025 14:53:53 +0800 Subject: [PATCH 137/153] fix precision by fix a wrong branch condition in w8a8_dynamic.py --- vllm_ascend/quantization/w8a8_dynamic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index 88f6890d50..a1793977dd 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -678,7 +678,7 @@ def apply( global_redundant_expert_num=global_redundant_expert_num, shared_experts=shared_experts, **kwargs), topk_ids - elif self.torchair_graph_enabled or self.ep_group.world_size == 1: + elif fused_moe_state == FusedMoEState.AllGather: return fused_experts(hidden_states=x, w1=layer.w13_weight, w1_scale=layer.w13_weight_scale, From 45766f672f19c2e3ee8355bda8a8f53274b850ea Mon Sep 17 00:00:00 2001 From: lt Date: Mon, 23 Jun 2025 15:03:21 +0800 Subject: [PATCH 138/153] fix code format alignment --- vllm_ascend/eplb/eplb_updator.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 147d6182bc..97599cb6a1 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -130,22 +130,22 @@ def forward_before(self): self.reqs = [] self.eplb_loader.asyn_expert_weight_transfer(self.reqs) -def forward_end(self,dummy_run=False): - self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run) - if not self.update_in_flight: - load_gather_iteration, update_iteration = self.get_update_iteration() - if 
load_gather_iteration: - moe_load = self.compute_and_set_moe_load(dummy_run) - if update_iteration: - self.wakeup_eplb_worker() - self.update_in_flight = True - self.wait_worker_iterations = 0 - self.weight_loading = False - - if self.update_in_flight: - self.wait_worker_iterations = self.wait_worker_iterations + 1 - - self.eplb_loader.update_expert_map_and_weight(self.reqs, self.redundant_enable) + def forward_end(self,dummy_run=False): + self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run) + if not self.update_in_flight: + load_gather_iteration, update_iteration = self.get_update_iteration() + if load_gather_iteration: + moe_load = self.compute_and_set_moe_load(dummy_run) + if update_iteration: + self.wakeup_eplb_worker() + self.update_in_flight = True + self.wait_worker_iterations = 0 + self.weight_loading = False + + if self.update_in_flight: + self.wait_worker_iterations = self.wait_worker_iterations + 1 + + self.eplb_loader.update_expert_map_and_weight(self.reqs, self.redundant_enable) def compute_and_set_moe_load(self,dummy_run=False): local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run) From 6b36fafc48ed35df62129d3cc2b39d2c1fda86ca Mon Sep 17 00:00:00 2001 From: lt Date: Mon, 23 Jun 2025 15:06:25 +0800 Subject: [PATCH 139/153] update format --- vllm_ascend/eplb/eplb_updator.py | 44 +++++++++++++++++--------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 6e5fd7d762..ff9bbd9fbd 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -1,3 +1,4 @@ +# # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -22,6 +23,7 @@ from vllm_ascend.eplb.core.worker.eplb_worker import EplbProcess from vllm_ascend.eplb.core.loader.device_transfer_loader import D2DExpertWeightLoader + class EplbUpdator: def __init__(self, expert_map_path): @@ -42,7 +44,7 @@ def init_eplb(self, expert_map_path): if not envs.VLLM_ALLOW_EXPERT_LOAD_COLLECTING: self.num_expert_load_gather = self.num_iterations except Exception as e: - self.num_expert_load_gather = self.num_iterations + self.num_expert_load_gather = self.num_iterations self.weight_update_counter = 0 self.expert_map_initialized = False @@ -72,19 +74,18 @@ def init_eplb(self, expert_map_path): }) self.eplb = EplbProcess( - shared_dict = self.shared_dict, - planner_q = self.planner_block_queue, - block_update_q = self.block_update_queue, - redundant_enable = self.redundant_enable, - policy_type = 6, - enable_d2d = True + shared_dict=self.shared_dict, + planner_q=self.planner_block_queue, + block_update_q=self.block_update_queue, + redundant_enable=self.redundant_enable, + policy_type=6, + enable_d2d=True ) self.eplb_process = self.eplb._launch_process() logger.info(f"[ModelRunner] Launched EPLB process (pid={self.eplb_process.pid})") - def get_update_iteration(self): self.cur_iterations = self.cur_iterations + 1 load_gather_iteration = self.cur_iterations % self.num_expert_load_gather == 0 if not self.gate_eplb else self.cur_iterations == self.num_iterations @@ -94,7 +95,8 @@ def get_update_iteration(self): def get_init_expert_map(self): try: if not self.expert_map_initialized: - self.shared_dict["expert_maps"] = self.adaptor.get_init_expert_map_from_file(self.num_moe_layers, self.expert_map_path) + self.shared_dict["expert_maps"] = self.adaptor.get_init_expert_map_from_file(self.num_moe_layers, + 
self.expert_map_path) self.expert_map_initialized = True except Exception as e: logger.warning(f"[ModelRunner] Failed to wake EPLB process: {e}", exc_info=True) @@ -103,6 +105,7 @@ def wakeup_eplb_worker(self): self.planner_block_queue.put(1) def forward_before(self): + # Batch after eplb process being triggered, get update info provided by eplb process if self.update_in_flight and self.weight_update_counter == 0 and self.wait_worker_iterations == self.num_wait_worker_iterations: self.wait_worker_iterations = 0 @@ -111,14 +114,16 @@ def forward_before(self): self.weight_loading = True if self.update_in_flight and self.weight_loading and self.weight_update_counter < self.num_moe_layers: - (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(0) + (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop( + 0) rank_id = torch.distributed.get_rank() self.eplb_loader.set_log2phy_map(log2phy_map) expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] - #logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") + # logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info_this_rank, - expert_recv_info_this_rank, updated_expert_map, layer_id + 3) + expert_recv_info_this_rank, updated_expert_map, + layer_id + 3) self.weight_update_counter += 1 if self.weight_update_counter == self.num_moe_layers: self.weight_update_counter = 0 @@ -129,8 +134,8 @@ def forward_before(self): self.reqs = [] self.eplb_loader.asyn_expert_weight_transfer(self.reqs) - def forward_end(self,dummy_run=False): - self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run) + def forward_end(self, dummy_run=False): + self.adaptor.get_rank_expert_workload(self.num_moe_layers, dummy_run) if not self.update_in_flight: load_gather_iteration, update_iteration = self.get_update_iteration() if load_gather_iteration: @@ -146,8 +151,8 @@ def forward_end(self,dummy_run=False): self.eplb_loader.update_expert_map_and_weight(self.reqs, self.redundant_enable) - def compute_and_set_moe_load(self,dummy_run=False): - local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run) + def compute_and_set_moe_load(self, dummy_run=False): + local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers, dummy_run) self._gather_buffer = None if dist.is_initialized(): self.world_size = dist.get_world_size() @@ -192,7 +197,7 @@ def warm_up_eplb(self): continue comm_op_list.append( dist.P2POp(dist.irecv, src_tensor, src_rank) - ) + ) if comm_op_list: reqs = dist.batch_isend_irecv(comm_op_list) @@ -205,7 +210,7 @@ def unpack_update_batch(self, packed_update_info): """ send_all, recv_all, stacked_maps, stacked_log2phy, layer_id_tensor = packed_update_info - maps = stacked_maps.unbind(0) + maps = stacked_maps.unbind(0) layer_ids = layer_id_tensor.tolist() if self.redundant_enable: @@ -217,7 +222,7 @@ def unpack_update_batch(self, packed_update_info): _send = send_all _recv = recv_all _maps = maps - _l2p = log2phy_list + _l2p = log2phy_list _lids = layer_ids recovered = [ @@ -249,7 +254,6 @@ def update_expert_load_statistical_period(self, num_expert_load_gather: int, num 
self.num_iterations = num_iterations logger.info(f" update {self.num_expert_load_gather=}, {self.num_iterations} success...") - def shutdown(self): """ Clean up the EPLB process. From 1a067a3f39f105b2ae278637f3d201997005891c Mon Sep 17 00:00:00 2001 From: wanghanqingLYT Date: Mon, 23 Jun 2025 15:57:30 +0800 Subject: [PATCH 140/153] fix incident for function forward_end in eplb_updator.py --- vllm_ascend/eplb/eplb_updator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 147d6182bc..d514172cb9 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -130,7 +130,7 @@ def forward_before(self): self.reqs = [] self.eplb_loader.asyn_expert_weight_transfer(self.reqs) -def forward_end(self,dummy_run=False): + def forward_end(self,dummy_run=False): self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run) if not self.update_in_flight: load_gather_iteration, update_iteration = self.get_update_iteration() From 9c329ed01508730c7e6f8c87e3180f6f1994d7ee Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 24 Jun 2025 09:29:04 +0800 Subject: [PATCH 141/153] optimize calculate moe load --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 53 ++++++++++++------------ vllm_ascend/eplb/eplb_updator.py | 10 ++--- vllm_ascend/models/deepseek_v2.py | 7 ++++ 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 9d4db1fb80..90ff9e65ed 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -38,7 +38,7 @@ def __init__(self, model, **args): self.num_moe_layers = self.model.config.num_hidden_layers - self.num_dense_layers self.global_expert_num = self.model.config.n_routed_experts - + # TODO: init self.expert_weight_names depending on different model types, only deepseek v3 w8a8 is supported here self.expert_weight_names = ["w13_weight", "w2_weight", "w13_weight_scale", "w13_weight_offset", "w2_weight_scale", "w2_weight_offset"] @@ -62,6 +62,8 @@ def __init__(self, model, **args): self.log2phy_map_per_layer[self.num_dense_layers + layer_idx] =\ self.model.get_log2phy_map(self.num_dense_layers + layer_idx) + self.all_topk_ids = [] + def init_buffer_tensor(self, num_buffer_tensor): for name in self.expert_weight_names: complete_name = "model.layers." + str(self.num_dense_layers) + ".mlp.experts." 
+ name @@ -82,39 +84,36 @@ def init_expert_param_per_layer(self): for name in self.expert_weight_names] ) - def get_rank_expert_workload( - self, - num_moe_layers: int, - dummy_run = False - ) -> torch.Tensor: - - all_topk_ids = [self.model.get_topk_ids(i) for i in range(num_moe_layers)] - stacked = torch.stack(all_topk_ids, dim=0) - L, B, K = stacked.shape - N = B * K - device = stacked.device - G = self.global_expert_num + def collect_topk_ids(self, dummy_run=False): + if dummy_run: + return + self.all_topk_ids.append(self.model.get_all_topk_ids(self.num_moe_layers)) - if not hasattr(self, "cum_moe_load") or self.cum_moe_load is None: - self.cum_moe_load = torch.zeros((L, G), - dtype=torch.int64, - device=device) + def get_rank_expert_workload(self) -> torch.Tensor: - if dummy_run: - return self.cum_moe_load + device = self.all_topk_ids[0][0].device + flat_list_per_layer = [[] for _ in range(self.num_moe_layers)] - ids1d = stacked.view(-1).to(torch.int64) + for period_data in self.all_topk_ids: + for l in range(self.num_moe_layers): + t = period_data[l] + flat_list_per_layer[l].append(t.reshape(-1)) - row_idx = torch.arange(L, device=device).repeat_interleave(N) + index_2d = torch.nn.utils.rnn.pad_sequence( + [torch.cat(flat_list_per_layer[l]) for l in range(self.num_moe_layers)], + batch_first=True, padding_value=-1 + ).to(device) - combined = row_idx * G + ids1d + mask = index_2d != -1 + index_2d = index_2d.masked_select(mask).reshape(self.num_moe_layers, -1) + src_2d = torch.ones_like(index_2d, dtype=torch.int64) - counts = torch.bincount(combined, minlength=L * G) - workload = counts.view(L, G) + moe_load = torch.zeros((self.num_moe_layers), self.global_expert_num, + dtype=torch.int64, device=device) + moe_load.scatter_add_(dim=1, index=index_2d, src=src_2d) - self.cum_moe_load.add_(workload) - - return self.cum_moe_load + self.all_topk_ids = [] + return moe_load def get_init_expert_map(self, num_moe_layers): expert_map = self.model.get_all_expert_map(num_moe_layers) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index d514172cb9..a399f294e5 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -57,7 +57,7 @@ def init_eplb(self, expert_map_path): self.cur_iterations: torch.int64 = 0 self.wait_worker_iterations: torch.int64 = 0 - self.num_wait_worker_iterations: torch.int64 = 10 + self.num_wait_worker_iterations: torch.int64 = 20 self.planner_block_queue = Queue() self.block_update_queue = Queue(maxsize=1) @@ -131,11 +131,11 @@ def forward_before(self): self.eplb_loader.asyn_expert_weight_transfer(self.reqs) def forward_end(self,dummy_run=False): - self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run) + self.adaptor.collect_topk_ids(dummy_run) if not self.update_in_flight: load_gather_iteration, update_iteration = self.get_update_iteration() if load_gather_iteration: - moe_load = self.compute_and_set_moe_load(dummy_run) + moe_load = self.compute_and_set_moe_load() if update_iteration: self.wakeup_eplb_worker() self.update_in_flight = True @@ -148,7 +148,7 @@ def forward_end(self,dummy_run=False): self.eplb_loader.update_expert_map_and_weight(self.reqs, self.redundant_enable) def compute_and_set_moe_load(self,dummy_run=False): - local_load = self.adaptor.get_rank_expert_workload(self.num_moe_layers,dummy_run) + local_load = self.adaptor.get_rank_expert_workload() self._gather_buffer = None if dist.is_initialized(): self.world_size = dist.get_world_size() @@ -173,7 +173,7 @@ def 
compute_and_set_moe_load(self,dummy_run=False): def warm_up_eplb(self): self.get_init_expert_map() - + self.adaptor.collect_topk_ids(dummy_run=False) self.compute_and_set_moe_load() src_tensor = torch.empty((1,), device=self.device) diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 93755cef4d..b70a239253 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -772,5 +772,12 @@ def get_all_expert_map(self,num_moe_layers): def get_topk_ids(self,layer_id): return self.model.layers[layer_id+3].mlp.experts.topk_ids + def get_all_topk_ids(self,num_moe_layers): + all_topk_id = [] + for layer_id in range(num_moe_layers): + load_tensor = self.get_topk_ids(layer_id) + all_topk_id.append(load_tensor) + return all_topk_id + class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM): pass From 4980f2c8922b6d20e2addbda00ebf615259fd21a Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Tue, 24 Jun 2025 17:43:17 +0800 Subject: [PATCH 142/153] fix bug in moe load & add expert load to josn --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 45 ++++++--------------- vllm_ascend/eplb/core/worker/eplb_worker.py | 1 + vllm_ascend/eplb/eplb_updator.py | 44 ++++++++++++-------- 3 files changed, 41 insertions(+), 49 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 90ff9e65ed..9f87d75750 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -90,8 +90,16 @@ def collect_topk_ids(self, dummy_run=False): self.all_topk_ids.append(self.model.get_all_topk_ids(self.num_moe_layers)) def get_rank_expert_workload(self) -> torch.Tensor: - device = self.all_topk_ids[0][0].device + if not hasattr(self, "moe_load"): + self.moe_load = torch.zeros( + (self.num_moe_layers), self.global_expert_num, + dtype=torch.int64, + device=self.all_topk_ids[0][0].device, + ) + else: + self.moe_load.zero_() + # pass flat_list_per_layer = [[] for _ in range(self.num_moe_layers)] for period_data in self.all_topk_ids: @@ -108,12 +116,11 @@ def get_rank_expert_workload(self) -> torch.Tensor: index_2d = index_2d.masked_select(mask).reshape(self.num_moe_layers, -1) src_2d = torch.ones_like(index_2d, dtype=torch.int64) - moe_load = torch.zeros((self.num_moe_layers), self.global_expert_num, - dtype=torch.int64, device=device) - moe_load.scatter_add_(dim=1, index=index_2d, src=src_2d) + self.moe_load.scatter_add_(dim=1, index=index_2d, src=src_2d) - self.all_topk_ids = [] - return moe_load + if self.all_topk_ids: + self.all_topk_ids[:] = self.all_topk_ids[-1:] + return self.moe_load def get_init_expert_map(self, num_moe_layers): expert_map = self.model.get_all_expert_map(num_moe_layers) @@ -135,32 +142,6 @@ def get_init_expert_map(self, num_moe_layers): return all_expert_maps - def local2global(self, - placement_local: torch.Tensor - ) -> torch.Tensor: - - L, G, E_local = placement_local.shape - device = placement_local.device - - max_id = torch.max(placement_local) - E_global = (max_id + 1).item() if max_id >= 0 else 0 - - if E_global == 0: - return torch.empty((L, G, 0), dtype=torch.long, device=device) - - placement_global = torch.full((L, G, E_global), - fill_value=-1, - dtype=torch.long, - device=device) - - valid = placement_local >= 0 - l_idx, g_idx, slot_idx = valid.nonzero(as_tuple=True) - gid_idx = placement_local[l_idx, g_idx, slot_idx] - - placement_global[l_idx, g_idx, gid_idx] = slot_idx - - return placement_global - def get_init_expert_map_from_file(self, 
num_moe_layers, expert_map_path): try: diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index f63996e4ed..88831739e7 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -62,6 +62,7 @@ def do_update(self): #根据负载信息,获取更新后的专家表 load_info, old_placement = self.global2local(load_info, self.old_expert_maps, self.num_local_experts) + self.shared_dict["load_info"] = load_info changed, priority, new_placement = self.calculate_rebalance_experts(load_info, old_placement) if not torch.is_tensor(new_placement): diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index a399f294e5..9e34cb4565 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -14,7 +14,9 @@ # limitations under the License. # This file is a part of the vllm-ascend project. # + import torch +from typing import Dict, List import torch.distributed as dist import vllm.envs as envs from multiprocessing import Queue, Manager @@ -22,6 +24,7 @@ from vllm.logger import logger from vllm_ascend.eplb.core.worker.eplb_worker import EplbProcess from vllm_ascend.eplb.core.loader.device_transfer_loader import D2DExpertWeightLoader +from vllm_ascend.eplb.tool.eplb_utils import ExpertMapUtils class EplbUpdator: @@ -32,6 +35,7 @@ def set_adaptor(self, adaptor): self.adaptor = adaptor self.eplb_loader = D2DExpertWeightLoader(eplb_adaptor=self.adaptor) self.num_moe_layers = self.adaptor.num_moe_layers + self.global_expert_num = self.adaptor.global_expert_num def init_eplb(self, expert_map_path): self.num_expert_load_gather = 10 @@ -69,7 +73,9 @@ def init_eplb(self, expert_map_path): # 热度负载信息 [num_layers, world_size, num_experts] "moe_load": None, # 所有的专家表[num_layers, world_size, num_experts] - "expert_maps": None + "expert_maps": None, + # 热度负载信息 [num_layers, world_size, local_num_experts] + "load_info": None, }) self.eplb = EplbProcess( @@ -125,11 +131,11 @@ def forward_before(self): self.weight_update_counter = 0 self.update_in_flight = False self.update_info_all = [] - # set asynchronous stream for d2d expert weight update self.reqs = [] self.eplb_loader.asyn_expert_weight_transfer(self.reqs) + def forward_end(self,dummy_run=False): self.adaptor.collect_topk_ids(dummy_run) if not self.update_in_flight: @@ -149,6 +155,7 @@ def forward_end(self,dummy_run=False): def compute_and_set_moe_load(self,dummy_run=False): local_load = self.adaptor.get_rank_expert_workload() + self._gather_buffer = None if dist.is_initialized(): self.world_size = dist.get_world_size() @@ -229,20 +236,24 @@ def unpack_update_batch(self, packed_update_info): return recovered def get_expert_load(self) -> str: - - # todo wjh 给到返回值 - # return self.shared_dict['moe_load'] - # mock json_str - experts_load = ('{\"expert_load\":[' - '{\"ip\":\"141.xxx.xxx.181\",' - '\"node_0\":' - '{\"card_0\":' - '[{\"layer_4\":{\"expert_0\":3,\"expert_2\":1}},{\"layer_5\":{\"expert_0\":3,\"expert_2\":1}}],' - '\"card_1\":[{\"layer_4\":{\"expert_1\":3,\"expert_3\":1},\"layer_5\":{\"expert_0\":3,\"' - 'expert_2\":1}}]}},{\"ip\":\"141.xxx.xxx.177\",\"node_0\":{\"card_0\":[{\"layer_4\":' - '{\"expert_0\":3,\"expert_2\":1}},{\"layer_5\":{\"expert_0\":3,\"expert_2\":1}}],' - '\"card_1\":[{\"layer_4\":{\"expert_1\":3,\"expert_3\":1}}]}}]}') - return experts_load + + load_info = self.shared_dict["load_info"] # Tensor [L, W, local_experts_num] + L, W, _ = load_info.shape + + expert_load: Dict[str, List[dict]] = {} + for c in range(W): + layers: 
List[dict] = [] + for l in range(L): + counts_1d = load_info[l, c] + + layer_val = { + f"expert_{e}": int(v) + for e, v in enumerate(counts_1d.tolist()) + } + layers.append({f"layer_{l}": layer_val}) + expert_load[f"card_{c}"] = layers + + return {"expert_load": expert_load} def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): logger.info(f" start update {self.num_expert_load_gather=}, {self.num_iterations}...") @@ -250,7 +261,6 @@ def update_expert_load_statistical_period(self, num_expert_load_gather: int, num self.num_iterations = num_iterations logger.info(f" update {self.num_expert_load_gather=}, {self.num_iterations} success...") - def shutdown(self): """ Clean up the EPLB process. From 9d9c93acae7eed694d8fb93adaad3ac34a34ddc4 Mon Sep 17 00:00:00 2001 From: lt Date: Tue, 24 Jun 2025 20:07:43 +0800 Subject: [PATCH 143/153] update get_expert_load return type --- vllm_ascend/worker/model_runner_v1.py | 2 +- vllm_ascend/worker/worker_v1.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 70fdea6f02..e7bf24661d 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1587,7 +1587,7 @@ def profile_run(self) -> None: self.encoder_cache.clear() gc.collect() - def do_get_expert_load(self) -> str: + def do_get_expert_load(self) -> torch.Tensor: return self.eplb_updator.get_expert_load() def do_update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index a5a6dcf93b..89504b34bd 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -209,7 +209,7 @@ def compile_or_warm_up_model(self) -> None: # the model initialization and profiling. set_random_seed(self.model_config.seed) - def get_expert_load(self) -> str: + def get_expert_load(self) -> torch.Tensor: """ todo 一共几个worker""" moe_load = self.model_runner.do_get_expert_load() return moe_load From 162d106c68d6c13c0d47273245e735c22a28fbd0 Mon Sep 17 00:00:00 2001 From: wanghanqingLYT Date: Wed, 25 Jun 2025 14:51:42 +0800 Subject: [PATCH 144/153] fix bug when running benchmark by move forward_before behind return of empty batch in execute_model --- vllm_ascend/worker/model_runner_v1.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 70fdea6f02..53f1f79f94 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1231,15 +1231,16 @@ def execute_model( intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[ModelRunnerOutput, torch.Tensor]: - if self.dynamic_eplb: - self.eplb_updator.forward_before() - with ProfileExecuteDuration().capture_async( "prepare input and forward"): self._update_states(scheduler_output) if not scheduler_output.total_num_scheduled_tokens: # Return empty ModelRunnerOuptut if there's no work to do. 
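For reference, the nested structure that `get_expert_load` assembles in the patches above can be reproduced outside the runner. A minimal standalone sketch, assuming a load tensor shaped `[num_moe_layers, num_cards, num_local_experts]` (the function name and toy sizes here are illustrative, not part of the patch):

```python
import torch

def expert_load_to_dict(load_info: torch.Tensor) -> dict:
    """Turn a [num_moe_layers, num_cards, num_local_experts] count tensor
    into the nested per-card / per-layer / per-expert structure."""
    num_layers, num_cards, _ = load_info.shape
    expert_load = {}
    for card in range(num_cards):
        layers = []
        for layer in range(num_layers):
            counts = load_info[layer, card].tolist()
            layers.append({f"layer_{layer}":
                           {f"expert_{e}": int(c) for e, c in enumerate(counts)}})
        expert_load[f"card_{card}"] = layers
    return {"expert_load": expert_load}

# toy check: 2 MoE layers, 2 cards, 4 local experts per card
print(expert_load_to_dict(torch.randint(0, 10, (2, 2, 4))))
```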
return EMPTY_MODEL_RUNNER_OUTPUT + + if self.dynamic_eplb: + self.eplb_updator.forward_before() + (attn_metadata, hidden_states, spec_decode_metadata, positions, num_scheduled_tokens, sample_indices) = (self._process_reqs(scheduler_output, From 1f0b9804e8e2be323f42086606da821812c7d48c Mon Sep 17 00:00:00 2001 From: NPUDev Date: Thu, 26 Jun 2025 16:45:04 +0800 Subject: [PATCH 145/153] fix SwiftBalancer eplb algo --- vllm_ascend/eplb/core/policy/dynamic_ep_v2.py | 105 ++++++++++-------- 1 file changed, 56 insertions(+), 49 deletions(-) diff --git a/vllm_ascend/eplb/core/policy/dynamic_ep_v2.py b/vllm_ascend/eplb/core/policy/dynamic_ep_v2.py index 783e04cbf0..775cf5f71d 100644 --- a/vllm_ascend/eplb/core/policy/dynamic_ep_v2.py +++ b/vllm_ascend/eplb/core/policy/dynamic_ep_v2.py @@ -383,7 +383,30 @@ def non_redundant_expert_information(origin_deployment, updated_weights, num_rad return device_assignments, device_weights, device_loads, device_counts @staticmethod - def distribute_redun_experts(device_assignments, device_weights, device_loads, device_counts, redundant_expert_list, + def recomputing_weight(layer_workloads, device_assignments, device_weights, device_loads): + # 统计专家出现次数 + num_all_experts = [0] * len(layer_workloads) + num_devices = len(device_assignments) + for device_id in range(num_devices): + num_expert_per_npu = len(device_assignments[device_id]) + for idx in range(num_expert_per_npu): + num_all_experts[idx] += device_assignments[device_id][idx] + + for device_id in range(num_devices): + num_expert_per_npu = len(device_weights[device_id]) + total_weight = 0.0 + for idx in range(num_expert_per_npu): + expert_id = device_assignments[device_id][idx] + if num_all_experts[expert_id] == 0: + print("Error: Division by zero") + device_weights[device_id][idx] = layer_workloads[expert_id] / num_all_experts[expert_id] + total_weight += device_weights[device_id][idx] + device_loads[device_id] = total_weight + + return device_weights, device_loads + + @staticmethod + def distribute_redun_experts(self, layer_workloads, device_assignments, device_weights, device_loads, device_counts, redundant_expert_list, items_per_device, expert_form_device, num_experts): num_devices = len(device_assignments) @@ -411,18 +434,16 @@ def distribute_redun_experts(device_assignments, device_weights, device_loads, d communication_box_index = expert_form_device[expert_id] com_between_devices[candidate][communication_box_index] = expert_id # 极端情况下存在冗余专家没装箱 导致箱子有空位 随机填入专家 待优化 + flag = False for dev_id in range(num_devices): # 检查容量限制 if device_counts[dev_id] < items_per_device: # 遍历合适的专家 for expert_id in range(num_experts): if expert_id not in device_assignments[dev_id]: - # 找到对应权重 - weight = 0 - for i in range(num_devices): - for j in range(len(device_assignments[i])): - if expert_id == device_assignments[i][j]: - weight = device_weights[i][j] + flag = True + # 随机初始化一个权重 + weight = 0.0 # 和该专家相关的卡权重发生变化 待修改 device_assignments[dev_id].insert(0, expert_id) device_weights[dev_id].insert(0, weight) @@ -432,12 +453,14 @@ def distribute_redun_experts(device_assignments, device_weights, device_loads, d communication_box_index = expert_form_device[expert_id] com_between_devices[dev_id][communication_box_index] = expert_id break - #todo 重新生成权重 + + if flag: + device_weights, device_loads = self.recomputing_weight(layer_workloads, device_assignments, device_weights, device_loads) return device_assignments, device_weights, device_loads, device_counts, com_between_devices @staticmethod - def redundancy_again(self, 
origin_weights, num_redundant_experts, origin_deployment, expert_form_device, num_node, + def redundancy_again(self, layer_workloads, origin_weights, num_redundant_experts, origin_deployment, expert_form_device, num_node, is_node_redundant): # 每张卡上专家数量 @@ -461,6 +484,8 @@ def redundancy_again(self, origin_weights, num_redundant_experts, origin_deploym # 新计算的冗余专家进行分配 device_assignments, device_weights, device_loads, device_counts, com_between_devices = self.distribute_redun_experts( + self, + layer_workloads, device_assignments, device_weights, device_loads, @@ -554,6 +579,7 @@ def redundant_expert_deployment(self, layer_workloads, original_deployment, expe cur_device_assignments, cur_device_weights, cur_device_loads, cur_device_counts, cur_com_between_devices = self.redundancy_again( self, + layer_workloads, cur_node_weights, per_node_redun_expert_num, cur_original_deployment, @@ -569,6 +595,7 @@ def redundant_expert_deployment(self, layer_workloads, original_deployment, expe else: device_assignments, device_weights, device_loads, device_counts, com_between_devices = self.redundancy_again( self, + layer_workloads, weights, redundancy_expert_num, original_deployment, @@ -583,7 +610,7 @@ def redundant_expert_deployment(self, layer_workloads, original_deployment, expe @staticmethod def two_device_exchange_experts(cur_device_result, exchange_device_result, cur_exchanged_expert_id, - next_exchanged_expert_id, ave_workload, increment, num_redundancy_expert): + next_exchanged_expert_id, ave_workload, increment, num_redundancy_expert, cur_org_placement, next_org_placement): cur_device_weight = cur_device_result['expert_weights'] next_device_weight = exchange_device_result['expert_weights'] @@ -609,7 +636,8 @@ def two_device_exchange_experts(cur_device_result, exchange_device_result, cur_e continue # 交换专家限制卡内专家不同 change_flag = True - if cur_device_expert_id[index] in next_device_expert_id or next_device_expert_id[next_index] in cur_device_expert_id: + if ((cur_device_expert_id[index] in next_device_expert_id or next_device_expert_id[next_index] in cur_device_expert_id) or + (cur_org_placement[0] == next_device_expert_id[next_index] or next_org_placement[0] == cur_device_expert_id[index])): change_flag = False # 选择的专家不能是参与过交换的 if (cur_device_expert_id[index] not in cur_exchanged_expert_id) and ( @@ -627,8 +655,7 @@ def two_device_exchange_experts(cur_device_result, exchange_device_result, cur_e @staticmethod def expert_exchange_between_devices(self, ave_workload, increment, cur_layer_result, com_between_devices, num_redundancy_expert, - node_idx=0, - per_node_device_num=0, is_node_redundant=False): + org_placement_table, node_idx=0, per_node_device_num=0, is_node_redundant=False): if is_node_redundant: # 拿出当前节点内设备的信息 @@ -677,7 +704,9 @@ def expert_exchange_between_devices(self, ave_workload, increment, cur_layer_res next_exchanged_expert_id, ave_workload, increment, - num_redundancy_expert) + num_redundancy_expert, + org_placement_table[max_weight_device_id], + org_placement_table[min_weight_device_id]) # 有符合条件的专家进行交换 if cur_exchange_index != -1: @@ -700,7 +729,7 @@ def expert_exchange_between_devices(self, ave_workload, increment, cur_layer_res @staticmethod def exchange_experts(self, layer_result, layer_com_between_devices, num_nodes, device_num, is_node_redundant, - ave_workload, increment, num_redundancy_expert): + ave_workload, increment, num_redundancy_expert, org_placement_table): global_deployment = [] @@ -709,9 +738,9 @@ def exchange_experts(self, layer_result, layer_com_between_devices, 
num_nodes, d for node_idx in range(num_nodes): self.expert_exchange_between_devices(self, ave_workload, increment, layer_result, layer_com_between_devices, num_redundancy_expert, - node_idx, per_node_device_num, is_node_redundant) + org_placement_table, node_idx, per_node_device_num, is_node_redundant) else: - self.expert_exchange_between_devices(self, ave_workload, increment, layer_result, layer_com_between_devices, num_redundancy_expert) + self.expert_exchange_between_devices(self, ave_workload, increment, layer_result, layer_com_between_devices, num_redundancy_expert, org_placement_table) max_workload = 0 for box in layer_result: @@ -734,14 +763,15 @@ def count_elements(self, lst): return count def rebalance_experts(self, current_expert_table, expert_workload): - + # 输入:当前专家部署信息和对应的负载信息,形状为layer_num, num_npus, experts_per_npu info = DynamicTable() - info.workload_table = np.array(expert_workload) - info.placement_table = np.array(current_expert_table) + info.workload_table = expert_workload.numpy() + info.placement_table = current_expert_table.numpy() layer_num, num_npus, experts_per_npu = info.workload_table.shape expert_ids, counts = np.unique(info.placement_table[0], return_counts=True) num_redundancy_expert = self.get_redundant_num(num_npus, counts) num_original_expert = len(expert_ids) + # 负载信息转化为 58 * 256 layer_workloads = self.add_redundant(info.placement_table, info.workload_table, num_original_expert) max_heat_per_layer_before = self.calculate_max_heat_per_layer(info.workload_table, layer_num) npu_heat_all_origin = sum(max_heat_per_layer_before) @@ -764,50 +794,31 @@ def rebalance_experts(self, current_expert_table, expert_workload): # 每个卡部署的专家数量 一个冗余专家 global_deployment = [[[] for _ in range(num_npus)] for _ in range(layer_num)] # 统计更换数据集后的初始58层不均衡度 - layer_initial_imbalance = self.calculate_initial_imbalance(current_expert_table, layer_workloads) + layer_initial_imbalance = self.calculate_initial_imbalance(info.placement_table, layer_workloads) # 遍历获得每一层的放置策略,考虑计算均衡 max_heat_per_layer_after = np.zeros([layer_num]) sum_num = 0 for layer in range(layer_num): # 不均衡度小于特定阈值不调整 if layer_initial_imbalance[layer] < 1.1: - global_deployment[layer] = current_expert_table[layer] + global_deployment[layer] = info.placement_table[layer] continue ave_workload = np.sum(layer_workloads[layer]) / num_npus - for device_id, device in enumerate(current_expert_table[layer]): + for device_id, device in enumerate(info.placement_table[layer]): for index, expert_id in enumerate(device): if index != 0: expert_from_device[layer][expert_id] = device_id # 调整冗余专家 result, max_workload, com_between_devices = self.redundant_expert_deployment(self, layer_workloads[layer], - current_expert_table[layer], + info.placement_table[layer], expert_from_device[layer], num_node, False) # 交换专家 global_deployment[layer], new_max_workload = self.exchange_experts(self, result, com_between_devices, num_node, num_npus, False, ave_workload, - 0.05, num_redundancy_expert) - - # To guarantee there is no expert movement inside a NPU - start_physical_idx = 1 if num_redundancy_expert else 0 - for rank in range(num_npus): - physical_expert = start_physical_idx - while physical_expert in range(start_physical_idx, experts_per_npu): - # skip the expert which is moved into this rank - if global_deployment[layer][rank][physical_expert] not in current_expert_table[layer, rank, :]: - physical_expert += 1 - continue - - if global_deployment[layer][rank][physical_expert] != current_expert_table[layer][rank][physical_expert]: - right_idx = 
np.where(current_expert_table[layer][rank] == global_deployment[layer][rank][physical_expert])[0][0] - # exchange expert with the expert on the right physical index - tempt = global_deployment[layer][rank][right_idx] - global_deployment[layer][rank][right_idx] = global_deployment[layer][rank][physical_expert] - global_deployment[layer][rank][physical_expert] = tempt - else: - physical_expert += 1 + 0.05, num_redundancy_expert, info.placement_table[layer]) for device_id in range(num_npus): com_between_devices[device_id] = {int(key): int(value) for key, value in @@ -828,8 +839,4 @@ def rebalance_experts(self, current_expert_table, expert_workload): if npu_heat_all_after < 0.95 * npu_heat_all_origin: change = 1 - return change, per_layer_priority, np.array(global_deployment).tolist() - - - - + return change, per_layer_priority, np.array(global_deployment).tolist() \ No newline at end of file From d018ec8c64eb97acc7e82523d779d8a75e00e813 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Fri, 27 Jun 2025 17:32:57 +0800 Subject: [PATCH 146/153] fix get_expert_load --- vllm_ascend/eplb/core/worker/eplb_worker.py | 3 +-- vllm_ascend/eplb/eplb_updator.py | 10 ++++--- vllm_ascend/eplb/tool/eplb_utils.py | 30 +++++++++++++++++++++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 88831739e7..bf569f147c 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -62,14 +62,13 @@ def do_update(self): #根据负载信息,获取更新后的专家表 load_info, old_placement = self.global2local(load_info, self.old_expert_maps, self.num_local_experts) - self.shared_dict["load_info"] = load_info changed, priority, new_placement = self.calculate_rebalance_experts(load_info, old_placement) if not torch.is_tensor(new_placement): new_placement = torch.tensor(new_placement) self.check_expert_placement(old_placement, new_placement) new_expert_maps = self.local2global(new_placement) - + self.update_expert_map(new_expert_maps) logger.debug(f"[EPLB Process new_map differs, performing D2D") update_info = self.compose_expert_update_info_bipartite(new_expert_maps, self.old_expert_maps)\ diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 9e34cb4565..4eb0567dac 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -74,8 +74,6 @@ def init_eplb(self, expert_map_path): "moe_load": None, # 所有的专家表[num_layers, world_size, num_experts] "expert_maps": None, - # 热度负载信息 [num_layers, world_size, local_num_experts] - "load_info": None, }) self.eplb = EplbProcess( @@ -235,9 +233,12 @@ def unpack_update_batch(self, packed_update_info): ] return recovered - def get_expert_load(self) -> str: + def get_expert_load(self): + expert_maps = self.shared_dict["expert_maps"] + moe_load = self.shared_dict["moe_load"] # Tensor [L, W, global_experts_num] + num_local_experts = expert_maps.max() + 1 + load_info, _ = ExpertMapUtils.global2local_load(moe_load, expert_maps, num_local_experts) - load_info = self.shared_dict["load_info"] # Tensor [L, W, local_experts_num] L, W, _ = load_info.shape expert_load: Dict[str, List[dict]] = {} @@ -253,6 +254,7 @@ def get_expert_load(self) -> str: layers.append({f"layer_{l}": layer_val}) expert_load[f"card_{c}"] = layers + print(expert_load) return {"expert_load": expert_load} def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): diff --git 
a/vllm_ascend/eplb/tool/eplb_utils.py b/vllm_ascend/eplb/tool/eplb_utils.py index 4192fa9507..c385dafbae 100644 --- a/vllm_ascend/eplb/tool/eplb_utils.py +++ b/vllm_ascend/eplb/tool/eplb_utils.py @@ -83,3 +83,33 @@ def global2local(cls, pt_local[g_idx, slot_idx] = k_idx return pt_local + + @classmethod + def global2local_load(self, + workload: torch.Tensor, + placement: torch.Tensor, + E_local: int + ) -> tuple[torch.Tensor, torch.Tensor]: + + L, G, _ = placement.shape + device = placement.device + + wt_local = torch.full((L, G, E_local), + fill_value=-1, + dtype=workload.dtype, + device=device) + pt_local = torch.full((L, G, E_local), + fill_value=-1, + dtype=torch.long, + device=device) + + valid = placement >= 0 + l_idx, g_idx, k_idx = valid.nonzero(as_tuple=True) + + slot_idx = placement[l_idx, g_idx, k_idx] + values = workload[l_idx, g_idx, k_idx] + + wt_local[l_idx, g_idx, slot_idx] = values + pt_local[l_idx, g_idx, slot_idx] = k_idx + + return wt_local, pt_local \ No newline at end of file From 6a0a05e319c667369bdb70fbe4694e3d45f7db79 Mon Sep 17 00:00:00 2001 From: qmkakaxi Date: Fri, 27 Jun 2025 17:37:31 +0800 Subject: [PATCH 147/153] delete invaild print --- vllm_ascend/eplb/eplb_updator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 4eb0567dac..0773c2b4c0 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -254,7 +254,6 @@ def get_expert_load(self): layers.append({f"layer_{l}": layer_val}) expert_load[f"card_{c}"] = layers - print(expert_load) return {"expert_load": expert_load} def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): From 1547810b4074f71934cf650dad61b9f5ce478ee8 Mon Sep 17 00:00:00 2001 From: lt Date: Fri, 27 Jun 2025 17:41:12 +0800 Subject: [PATCH 148/153] delete empty tensor judgement --- vllm_ascend/eplb/eplb_updator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 356357b641..9bd1b1af7d 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -237,8 +237,6 @@ def unpack_update_batch(self, packed_update_info): def get_expert_load(self) -> torch.Tensor: expert_maps = self.shared_dict["expert_maps"] moe_load = self.shared_dict["moe_load"] # Tensor [L, W, global_experts_num] - if not moe_load: - return None num_local_experts = expert_maps.max() + 1 load_info, _ = ExpertMapUtils.global2local_load(moe_load, expert_maps, num_local_experts) return load_info From 3465ad697b1a579ffb79e9e99a83010897275d14 Mon Sep 17 00:00:00 2001 From: lt Date: Sat, 28 Jun 2025 17:44:25 +0800 Subject: [PATCH 149/153] relocate the code from the worker_runner to the server side. 
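The `global2local_load` helper added to `eplb_utils.py` above can be exercised in isolation. A simplified sketch that keeps the same tensor layout (`placement` holds a local slot id per global expert, or -1 when the expert is not hosted on that rank); the toy data is made up:

```python
import torch

def global2local_load(workload: torch.Tensor,
                      placement: torch.Tensor,
                      e_local: int):
    # workload:  [layers, ranks, global_experts] per-expert token counts
    # placement: [layers, ranks, global_experts] local slot id, or -1 when that
    #            global expert is not hosted on the rank
    L, G, _ = placement.shape
    wt_local = torch.full((L, G, e_local), -1, dtype=workload.dtype)
    pt_local = torch.full((L, G, e_local), -1, dtype=torch.long)
    l, g, k = (placement >= 0).nonzero(as_tuple=True)
    slot = placement[l, g, k]
    wt_local[l, g, slot] = workload[l, g, k]   # load seen by each local slot
    pt_local[l, g, slot] = k                   # global expert held by each slot
    return wt_local, pt_local

# 1 layer, 2 ranks, 4 global experts, 2 local slots per rank
placement = torch.tensor([[[0, 1, -1, -1], [-1, -1, 0, 1]]])
workload = torch.tensor([[[5, 3, 0, 0], [0, 0, 7, 2]]])
print(global2local_load(workload, placement, 2))
```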
---
 vllm_ascend/eplb/eplb_updator.py      | 6 +++---
 vllm_ascend/eplb/tool/eplb_utils.py   | 1 -
 vllm_ascend/worker/model_runner_v1.py | 2 +-
 vllm_ascend/worker/worker_v1.py       | 6 ++----
 4 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py
index 9bd1b1af7d..2fef9619e6 100644
--- a/vllm_ascend/eplb/eplb_updator.py
+++ b/vllm_ascend/eplb/eplb_updator.py
@@ -234,12 +234,12 @@ def unpack_update_batch(self, packed_update_info):
         ]
         return recovered
 
-    def get_expert_load(self) -> torch.Tensor:
+    def get_expert_load(self) -> tuple:
         expert_maps = self.shared_dict["expert_maps"]
         moe_load = self.shared_dict["moe_load"] # Tensor [L, W, global_experts_num]
         num_local_experts = expert_maps.max() + 1
-        load_info, _ = ExpertMapUtils.global2local_load(moe_load, expert_maps, num_local_experts)
-        return load_info
+        return moe_load, expert_maps, num_local_experts
+
 
     def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int):
         logger.info(f" start update {self.num_expert_load_gather=}, {self.num_iterations}...")
diff --git a/vllm_ascend/eplb/tool/eplb_utils.py b/vllm_ascend/eplb/tool/eplb_utils.py
index c385dafbae..156f7a9b9d 100644
--- a/vllm_ascend/eplb/tool/eplb_utils.py
+++ b/vllm_ascend/eplb/tool/eplb_utils.py
@@ -90,7 +90,6 @@ def global2local_load(self,
                            placement: torch.Tensor,
                            E_local: int
                            ) -> tuple[torch.Tensor, torch.Tensor]:
-
         L, G, _ = placement.shape
         device = placement.device
 
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index a1a9069982..097eec3624 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1588,7 +1588,7 @@ def profile_run(self) -> None:
         self.encoder_cache.clear()
         gc.collect()
 
-    def do_get_expert_load(self) -> torch.Tensor:
+    def do_get_expert_load(self) -> tuple:
         return self.eplb_updator.get_expert_load()
 
     def do_update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int):
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index fcf6b657b1..48675ed341 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -209,10 +209,8 @@ def compile_or_warm_up_model(self) -> None:
         # the model initialization and profiling.
set_random_seed(self.model_config.seed) - def get_expert_load(self) -> torch.Tensor: - moe_load = self.model_runner.do_get_expert_load() - return moe_load - + def get_expert_load(self) -> tuple: + return self.model_runner.do_get_expert_load() def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): self.model_runner.do_update_expert_load_statistical_period(num_expert_load_gather, num_iterations) From ad5e7e161a7ddb5f56753fc0e4c60aead5abc7de Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Mon, 30 Jun 2025 10:23:17 +0800 Subject: [PATCH 150/153] collect moe load after dispatch --- vllm_ascend/eplb/adaptor/vllm_adaptor.py | 49 +++++---------------- vllm_ascend/eplb/core/worker/eplb_worker.py | 12 +---- vllm_ascend/eplb/eplb_updator.py | 21 +++++---- vllm_ascend/models/deepseek_v2.py | 27 +++++++----- vllm_ascend/ops/fused_moe.py | 16 +++++-- vllm_ascend/quantization/w8a8_dynamic.py | 15 ++++--- vllm_ascend/worker/model_runner_v1.py | 6 ++- 7 files changed, 63 insertions(+), 83 deletions(-) diff --git a/vllm_ascend/eplb/adaptor/vllm_adaptor.py b/vllm_ascend/eplb/adaptor/vllm_adaptor.py index 9f87d75750..585fcad7eb 100644 --- a/vllm_ascend/eplb/adaptor/vllm_adaptor.py +++ b/vllm_ascend/eplb/adaptor/vllm_adaptor.py @@ -84,42 +84,13 @@ def init_expert_param_per_layer(self): for name in self.expert_weight_names] ) - def collect_topk_ids(self, dummy_run=False): - if dummy_run: - return - self.all_topk_ids.append(self.model.get_all_topk_ids(self.num_moe_layers)) + # def collect_topk_ids(self, dummy_run=False): + # if dummy_run: + # return + # self.all_topk_ids.append(self.model.get_all_topk_ids(self.num_moe_layers)) def get_rank_expert_workload(self) -> torch.Tensor: - device = self.all_topk_ids[0][0].device - if not hasattr(self, "moe_load"): - self.moe_load = torch.zeros( - (self.num_moe_layers), self.global_expert_num, - dtype=torch.int64, - device=self.all_topk_ids[0][0].device, - ) - else: - self.moe_load.zero_() - # pass - flat_list_per_layer = [[] for _ in range(self.num_moe_layers)] - - for period_data in self.all_topk_ids: - for l in range(self.num_moe_layers): - t = period_data[l] - flat_list_per_layer[l].append(t.reshape(-1)) - - index_2d = torch.nn.utils.rnn.pad_sequence( - [torch.cat(flat_list_per_layer[l]) for l in range(self.num_moe_layers)], - batch_first=True, padding_value=-1 - ).to(device) - - mask = index_2d != -1 - index_2d = index_2d.masked_select(mask).reshape(self.num_moe_layers, -1) - src_2d = torch.ones_like(index_2d, dtype=torch.int64) - - self.moe_load.scatter_add_(dim=1, index=index_2d, src=src_2d) - - if self.all_topk_ids: - self.all_topk_ids[:] = self.all_topk_ids[-1:] + self.moe_load = self.model.get_all_moe_loads() return self.moe_load def get_init_expert_map(self, num_moe_layers): @@ -224,13 +195,13 @@ def determine_expert_map_all(self): for r in range(self.world_size): if r < self.world_size - 1: - start = r * local_num_experts - end = (r + 1) * local_num_experts - local_count = local_num_experts + start = r * local_num_experts + end = (r + 1) * local_num_experts + local_count = local_num_experts else: - start = r * local_num_experts + start = r * local_num_experts end = self.global_expert_num - local_count = self.global_expert_num - r * local_num_experts + local_count = self.global_expert_num - r * local_num_experts local_ids = torch.arange(local_count, dtype=torch.int32) expert_map_all[:, r, start:end] = local_ids.unsqueeze(0).expand(self.num_moe_layers, -1) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py 
b/vllm_ascend/eplb/core/worker/eplb_worker.py index 88831739e7..21c21424b6 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -61,8 +61,7 @@ def do_update(self): return #根据负载信息,获取更新后的专家表 - load_info, old_placement = self.global2local(load_info, self.old_expert_maps, self.num_local_experts) - self.shared_dict["load_info"] = load_info + old_placement = self.global2local(self.old_expert_maps, self.num_local_experts) changed, priority, new_placement = self.calculate_rebalance_experts(load_info, old_placement) if not torch.is_tensor(new_placement): @@ -277,7 +276,6 @@ def update_expert_map(self, expert_maps): self.shared_dict["expert_maps"] = expert_maps def global2local(self, - workload: torch.Tensor, placement: torch.Tensor, E_local: int ) -> tuple[torch.Tensor, torch.Tensor]: @@ -285,10 +283,6 @@ def global2local(self, L, G, _ = placement.shape device = placement.device - wt_local = torch.full((L, G, E_local), - fill_value=-1, - dtype=workload.dtype, - device=device) pt_local = torch.full((L, G, E_local), fill_value=-1, dtype=torch.long, @@ -298,12 +292,10 @@ def global2local(self, l_idx, g_idx, k_idx = valid.nonzero(as_tuple=True) slot_idx = placement[l_idx, g_idx, k_idx] - values = workload[l_idx, g_idx, k_idx] - wt_local[l_idx, g_idx, slot_idx] = values pt_local[l_idx, g_idx, slot_idx] = k_idx - return wt_local, pt_local + return pt_local def local2global(self, diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 9e34cb4565..90a56859f1 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -82,8 +82,8 @@ def init_eplb(self, expert_map_path): shared_dict = self.shared_dict, planner_q = self.planner_block_queue, block_update_q = self.block_update_queue, - redundant_enable = self.redundant_enable, - policy_type = 6, + redundant_enable = self.redundant_enable, + policy_type = 1, enable_d2d = True ) @@ -93,8 +93,8 @@ def init_eplb(self, expert_map_path): def get_update_iteration(self): self.cur_iterations = self.cur_iterations + 1 - load_gather_iteration = self.cur_iterations % self.num_expert_load_gather == 0 if not self.gate_eplb else self.cur_iterations == self.num_iterations - upate_iteration = self.cur_iterations % self.num_iterations == 0 if not self.gate_eplb else self.cur_iterations == self.num_iterations + load_gather_iteration = self.cur_iterations % self.num_expert_load_gather == 0 if not self.gate_eplb else self.cur_iterations == self.num_iterations + upate_iteration = self.cur_iterations % self.num_iterations == 0 if not self.gate_eplb else self.cur_iterations == self.num_iterations return load_gather_iteration, upate_iteration def get_init_expert_map(self): @@ -137,7 +137,6 @@ def forward_before(self): def forward_end(self,dummy_run=False): - self.adaptor.collect_topk_ids(dummy_run) if not self.update_in_flight: load_gather_iteration, update_iteration = self.get_update_iteration() if load_gather_iteration: @@ -175,12 +174,12 @@ def compute_and_set_moe_load(self,dummy_run=False): moe_load = local_load.unsqueeze(1) self.shared_dict["moe_load"] = moe_load.cpu() logger.debug(f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}") + self.adaptor.model.clear_all_moe_loads() return moe_load def warm_up_eplb(self): self.get_init_expert_map() - self.adaptor.collect_topk_ids(dummy_run=False) self.compute_and_set_moe_load() src_tensor = torch.empty((1,), device=self.device) @@ -236,18 +235,18 @@ def unpack_update_batch(self, packed_update_info): 
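`compute_and_set_moe_load` in these patches turns each rank's `[num_moe_layers, num_experts]` counters into the `[num_moe_layers, world_size, num_experts]` tensor stored in `shared_dict["moe_load"]`; the exact collective is outside this excerpt, so the following is only a sketch of one way to assemble that shape, runnable as a single process on CPU with the gloo backend:

```python
import os
import torch
import torch.distributed as dist

def gather_moe_load(local_load: torch.Tensor) -> torch.Tensor:
    """local_load: [num_moe_layers, num_experts] counts owned by this rank.
    Returns [num_moe_layers, world_size, num_experts]."""
    if dist.is_initialized() and dist.get_world_size() > 1:
        gathered = [torch.zeros_like(local_load)
                    for _ in range(dist.get_world_size())]
        dist.all_gather(gathered, local_load)
        return torch.stack(gathered, dim=1)
    # single-rank fallback, mirroring local_load.unsqueeze(1) above
    return local_load.unsqueeze(1)

if __name__ == "__main__":
    # one-process demo; inside vLLM the process group already exists
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29512")
    dist.init_process_group("gloo", rank=0, world_size=1)
    print(gather_moe_load(torch.randint(0, 10, (2, 8))).shape)  # (2, 1, 8)
    dist.destroy_process_group()
```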
return recovered def get_expert_load(self) -> str: - - load_info = self.shared_dict["load_info"] # Tensor [L, W, local_experts_num] + + load_info = self.shared_dict["load_info"] # Tensor [L, W, local_experts_num] L, W, _ = load_info.shape expert_load: Dict[str, List[dict]] = {} for c in range(W): layers: List[dict] = [] for l in range(L): - counts_1d = load_info[l, c] - + counts_1d = load_info[l, c] + layer_val = { - f"expert_{e}": int(v) + f"expert_{e}": int(v) for e, v in enumerate(counts_1d.tolist()) } layers.append({f"layer_{l}": layer_val}) diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index b70a239253..b6ec740698 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -727,6 +727,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config + self.num_dense_layers = self.config.first_k_dense_replace + self.num_moe_layers = self.config.num_hidden_layers - self.num_dense_layers + self.model = CustomDeepseekV2Model(vllm_config=vllm_config, prefix=maybe_prefix( prefix, "model")) @@ -755,13 +758,13 @@ def forward( inputs_embeds) return hidden_states - def get_expert_map(self,layer_id): + def get_expert_map(self, layer_id): return self.model.layers[layer_id].mlp.experts.get_map() - def get_log2phy_map(self,layer_id): + def get_log2phy_map(self, layer_id): return self.model.layers[layer_id].mlp.experts.get_log2phy_map() - def get_all_expert_map(self,num_moe_layers): + def get_all_expert_map(self, num_moe_layers): all_loads = [] for layer_id in range(num_moe_layers): load_tensor = self.get_expert_map(3+layer_id) # (num_experts_per_layer,) @@ -769,15 +772,17 @@ def get_all_expert_map(self,num_moe_layers): return torch.stack(all_loads, dim=0) - def get_topk_ids(self,layer_id): - return self.model.layers[layer_id+3].mlp.experts.topk_ids + def get_all_moe_loads(self): + all_moe_loads = torch.stack( + [self.model.layers[layer_id + self.num_dense_layers].mlp.experts.moe_load \ + for layer_id in range(self.num_moe_layers)], + dim=0 + ) + return all_moe_loads - def get_all_topk_ids(self,num_moe_layers): - all_topk_id = [] - for layer_id in range(num_moe_layers): - load_tensor = self.get_topk_ids(layer_id) - all_topk_id.append(load_tensor) - return all_topk_id + def clear_all_moe_loads(self): + for layer_id in range(self.num_moe_layers): + self.model.layers[layer_id + self.num_dense_layers].mlp.experts.clear_moe_load() class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM): pass diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index c5de0702b8..0142e6b16b 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -1012,8 +1012,6 @@ def __init__( AscendFusedMoE.moe_counter += 1 self.moe_instance_id = AscendFusedMoE.moe_counter - self.moe_load = None - self.topk_ids = None if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -1103,6 +1101,10 @@ def __init__( local_num_experts = torch.sum(self.expert_map != -1) \ if self.expert_map is not None else num_experts + self.moe_load = None + if self.dynamic_eplb: + self.moe_load = torch.zeros(local_num_experts, dtype=torch.int64) + moe_quant_params = { "num_experts": local_num_experts, "hidden_size": hidden_size, @@ -1176,7 +1178,7 @@ def forward(self, router_logits = get_dp_group().all_gather(router_logits, 0) # Matrix multiply. 
- e_hidden_states, self.topk_ids = self.quant_method.apply( + e_hidden_states, expert_token_num, group_list_type = self.quant_method.apply( layer=self, x=hidden_states, router_logits=router_logits, @@ -1198,6 +1200,10 @@ def forward(self, and self.enable_multistream_moe and not is_prefill else None, ) + if self.dynamic_eplb: + self.moe_load += expert_token_num if group_list_type else \ + torch.cat([expert_token_num[:1], expert_token_num[1:] - expert_token_num[:-1]]) + if shared_experts: if isinstance(e_hidden_states, tuple): e_hidden_states, shared_hidden_states = e_hidden_states @@ -1267,3 +1273,7 @@ def get_map(self): def get_log2phy_map(self): return self.log2phy + def clear_moe_load(self): + self.moe_load.zero_() + + diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index a1793977dd..4f165ed3eb 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -208,13 +208,14 @@ def fused_experts_with_mc2( hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2) + group_list_type = 1 if shared_experts is None: - return hidden_states + return hidden_states, expert_token_nums, group_list_type else: with npu_stream_switch("moe_secondary", 0): npu_wait_tensor(shared_act[0], down_out_list) shared_output, _ = shared_experts.down_proj(shared_act) - return hidden_states, shared_output + return hidden_states, shared_output, expert_token_nums, group_list_type # currently expert parallelism implemented with all2all @@ -343,7 +344,7 @@ def fused_experts_with_all2all( ) if len(original_shape) == 3: final_hidden_states = final_hidden_states.view(original_shape) - return final_hidden_states + return final_hidden_states, expert_tokens, group_list_type def fused_experts(hidden_states: torch.Tensor, @@ -457,7 +458,7 @@ def fused_experts(hidden_states: torch.Tensor, if len(original_shape) == 3: final_hidden_states = final_hidden_states.view(original_shape) - return final_hidden_states + return final_hidden_states, expert_tokens, group_list_type class AscendW8A8DynamicLinearMethod: @@ -677,7 +678,7 @@ def apply( log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, shared_experts=shared_experts, - **kwargs), topk_ids + **kwargs) elif fused_moe_state == FusedMoEState.AllGather: return fused_experts(hidden_states=x, w1=layer.w13_weight, @@ -687,7 +688,7 @@ def apply( topk_weights=topk_weights, topk_ids=topk_ids, top_k=top_k, - expert_map=expert_map), topk_ids + expert_map=expert_map) else: # The current implementation of deepseek moe splits hidden_states # according to tp_size before they are feed into fused_moe module. 
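The `moe_load` accumulation added to `forward()` above handles two group-list conventions coming back from the quant method: per-expert counts (`group_list_type == 1`, as set after the MC2 path) or a running prefix sum, which is converted with adjacent differences. A small self-contained sketch of that conversion:

```python
import torch

def per_expert_counts(expert_token_num: torch.Tensor, group_list_type: int) -> torch.Tensor:
    """group_list_type == 1: tensor already holds per-expert token counts.
    group_list_type == 0: tensor holds cumulative counts, so adjacent
    differences recover the per-expert values."""
    if group_list_type:
        return expert_token_num
    return torch.cat([expert_token_num[:1],
                      expert_token_num[1:] - expert_token_num[:-1]])

counts = torch.tensor([4, 0, 3, 1])
cumulative = torch.cumsum(counts, dim=0)      # tensor([4, 4, 7, 8])
assert torch.equal(per_expert_counts(cumulative, 0), counts)
assert torch.equal(per_expert_counts(counts, 1), counts)

# a per-layer accumulator like AscendFusedMoE.moe_load would then just add:
moe_load = torch.zeros(4, dtype=torch.int64)
moe_load += per_expert_counts(cumulative, 0)
print(moe_load)  # tensor([4, 0, 3, 1])
```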
@@ -706,7 +707,7 @@ def apply( ep_group=self.ep_group, log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, - ), topk_ids + ) def process_weights_after_loading(self, layer): if self.transpose_weight: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 53f1f79f94..1a40e75830 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -366,7 +366,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.dp_size = vllm_config.parallel_config.data_parallel_size self.dp_rank = vllm_config.parallel_config.data_parallel_rank - #EPLB + #EPLB self.dynamic_eplb = ascend_config.dynamic_eplb if self.dynamic_eplb == True: self.eplb_adaptor = None @@ -1240,7 +1240,7 @@ def execute_model( if self.dynamic_eplb: self.eplb_updator.forward_before() - + (attn_metadata, hidden_states, spec_decode_metadata, positions, num_scheduled_tokens, sample_indices) = (self._process_reqs(scheduler_output, @@ -1544,6 +1544,8 @@ def _dummy_run( intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds) + if is_profile_run and self.dynamic_eplb: + self.model.clear_all_moe_loads() if not is_compile and not is_profile_run and self.dynamic_eplb: dummy_run = True self.eplb_updator.forward_end(dummy_run) From 89bcf04f2aedb78b99ac71de053c480d2dd78f9a Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Tue, 1 Jul 2025 21:56:56 +0800 Subject: [PATCH 151/153] modify serialization of eplb process --- vllm_ascend/eplb/core/worker/eplb_worker.py | 28 ++++++++------------- vllm_ascend/eplb/eplb_updator.py | 15 +++++------ 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/vllm_ascend/eplb/core/worker/eplb_worker.py b/vllm_ascend/eplb/core/worker/eplb_worker.py index 650feffe66..c4aa86a4ad 100644 --- a/vllm_ascend/eplb/core/worker/eplb_worker.py +++ b/vllm_ascend/eplb/core/worker/eplb_worker.py @@ -336,30 +336,22 @@ def pack_update_info(self, update_info_generator): for send_info, recv_info, new_expert_map, layer_id in update_info_generator: - send_all.append(send_info) - recv_all.append(recv_info) + send_info_this_rank = send_info[self.rank_id] if self.rank_id in send_info else [] + recv_info_this_rank = recv_info[self.rank_id] if self.rank_id in recv_info else [] + send_all.append(send_info_this_rank) + recv_all.append(recv_info_this_rank) - maps.append(new_expert_map[self.rank_id]) + maps.append(new_expert_map[self.rank_id].numpy().tolist()) - if self.redundant_enable is not None: + if self.redundant_enable: log2phy_map = ExpertMapUtils.generate_log2phy_map(new_expert_map) - log2phy_all.append(log2phy_map) + log2phy_all.append(log2phy_map[self.rank_id].numpy().tolist()) + else: + log2phy_all.append([]) layer_ids.append(layer_id) - # 把 list of Tensor 堆成一个大 Tensor - stacked_maps = torch.stack(maps, dim=0) - layer_id_tensor = torch.as_tensor(layer_ids, dtype=torch.int64) - stacked_maps.share_memory_() - layer_id_tensor.share_memory_() - - if self.redundant_enable: - stacked_log2phy = torch.stack(log2phy_all, dim=0) - stacked_log2phy.share_memory_() - else: - stacked_log2phy = None - - return send_all, recv_all, stacked_maps, stacked_log2phy, layer_id_tensor + return list(zip(send_all, recv_all, maps, log2phy_all, layer_ids)) class EplbProcess: def __init__(self, shared_dict, planner_q, block_update_q, redundant_enable, policy_type: int = 0, enable_d2d: bool = True): diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index 1ea109f348..ffd7f3ac8f 100644 --- 
a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -16,6 +16,7 @@ # import torch +import numpy from typing import Dict, List import torch.distributed as dist import vllm.envs as envs @@ -111,19 +112,19 @@ def forward_before(self): # Batch after eplb process being triggered, get update info provided by eplb process if self.update_in_flight and self.weight_update_counter == 0 and self.wait_worker_iterations == self.num_wait_worker_iterations: self.wait_worker_iterations = 0 - packed_update_info = self.block_update_queue.get() - self.update_info_all = self.unpack_update_batch(packed_update_info) + self.update_info_all = self.block_update_queue.get() self.weight_loading = True if self.update_in_flight and self.weight_loading and self.weight_update_counter < self.num_moe_layers: (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(0) rank_id = torch.distributed.get_rank() - self.eplb_loader.set_log2phy_map(log2phy_map) - expert_send_info_this_rank = expert_send_info[rank_id] if rank_id in expert_send_info else [] - expert_recv_info_this_rank = expert_recv_info[rank_id] if rank_id in expert_recv_info else [] + if self.redundant_enable: + log2phy_map_this_rank = torch.from_numpy(numpy.array(log2phy_map)) + self.eplb_loader.set_log2phy_map(log2phy_map_this_rank) + updated_expert_map_this_rank = torch.from_numpy(numpy.array(updated_expert_map)) #logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") - self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info_this_rank, - expert_recv_info_this_rank, updated_expert_map, layer_id + 3) + self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info, expert_recv_info, + updated_expert_map_this_rank, layer_id + self.adaptor.num_dense_layers) self.weight_update_counter += 1 if self.weight_update_counter == self.num_moe_layers: self.weight_update_counter = 0 From 2b62a4729ae5dd9e0930dc211c021de5155b2133 Mon Sep 17 00:00:00 2001 From: wanghanqing_lyt Date: Thu, 3 Jul 2025 09:54:52 +0800 Subject: [PATCH 152/153] improve d2d expert weight update impl in eplb_updator.py --- vllm_ascend/eplb/eplb_updator.py | 83 +++++++++++++-------------- vllm_ascend/worker/model_runner_v1.py | 3 +- 2 files changed, 40 insertions(+), 46 deletions(-) diff --git a/vllm_ascend/eplb/eplb_updator.py b/vllm_ascend/eplb/eplb_updator.py index ffd7f3ac8f..dbdf9a11cf 100644 --- a/vllm_ascend/eplb/eplb_updator.py +++ b/vllm_ascend/eplb/eplb_updator.py @@ -40,20 +40,20 @@ def set_adaptor(self, adaptor): def init_eplb(self, expert_map_path): self.num_expert_load_gather = 10 - self.redundant_enable = (expert_map_path != None) - self.num_iterations: torch.int64 = 130 + self.periodic_load_gather = True + self.redundant_enable = (expert_map_path is not None) + self.num_iterations_eplb_update: torch.int64 = 130 self.expert_map_path = expert_map_path try: if not envs.VLLM_ALLOW_EXPERT_LOAD_COLLECTING: - self.num_expert_load_gather = self.num_iterations + self.num_expert_load_gather = self.num_iterations_eplb_update + self.periodic_load_gather = False except Exception as e: - self.num_expert_load_gather = self.num_iterations + self.num_expert_load_gather = self.num_iterations_eplb_update + self.periodic_load_gather = False - self.weight_update_counter = 0 self.expert_map_initialized = False - self.update_in_flight = False - self.gate_eplb = True self.reqs = [] @@ -61,7 +61,6 @@ def init_eplb(self, expert_map_path): 
self.cur_iterations: torch.int64 = 0 - self.wait_worker_iterations: torch.int64 = 0 self.num_wait_worker_iterations: torch.int64 = 20 self.planner_block_queue = Queue() @@ -90,11 +89,22 @@ def init_eplb(self, expert_map_path): logger.info(f"[ModelRunner] Launched EPLB process (pid={self.eplb_process.pid})") - def get_update_iteration(self): - self.cur_iterations = self.cur_iterations + 1 - load_gather_iteration = self.cur_iterations % self.num_expert_load_gather == 0 if not self.gate_eplb else self.cur_iterations == self.num_iterations - upate_iteration = self.cur_iterations % self.num_iterations == 0 if not self.gate_eplb else self.cur_iterations == self.num_iterations - return load_gather_iteration, upate_iteration + def update_iteration(self): + self.cur_iterations += 1 + if self.cur_iterations == (self.num_iterations_eplb_update +\ + self.num_wait_worker_iterations + self.num_moe_layers): + if not self.gate_eplb: + self.cur_iterations = 0 + + def get_update_info_flag(self): + return self.cur_iterations == (self.num_iterations_eplb_update + self.num_wait_worker_iterations) + + def wakeup_eplb_worker_flag(self): + return self.cur_iterations == (self.num_iterations_eplb_update - 1) + + def update_expert_weight_flag(self): + weight_update_counter = self.cur_iterations - (self.num_iterations_eplb_update + self.num_wait_worker_iterations) + return (weight_update_counter >= 0 and weight_update_counter < self.num_moe_layers) def get_init_expert_map(self): try: @@ -108,14 +118,11 @@ def wakeup_eplb_worker(self): self.planner_block_queue.put(1) def forward_before(self): - # Batch after eplb process being triggered, get update info provided by eplb process - if self.update_in_flight and self.weight_update_counter == 0 and self.wait_worker_iterations == self.num_wait_worker_iterations: - self.wait_worker_iterations = 0 + if self.get_update_info_flag(): self.update_info_all = self.block_update_queue.get() - self.weight_loading = True - if self.update_in_flight and self.weight_loading and self.weight_update_counter < self.num_moe_layers: + if self.update_expert_weight_flag(): (expert_send_info, expert_recv_info, updated_expert_map, log2phy_map, layer_id) = self.update_info_all.pop(0) rank_id = torch.distributed.get_rank() if self.redundant_enable: @@ -125,34 +132,22 @@ def forward_before(self): #logger.info(f"check update info, layer = {layer_id}, send = {expert_send_info_this_rank}, recv = {expert_recv_info_this_rank}") self.eplb_loader.generate_expert_d2d_transfer_task(expert_send_info, expert_recv_info, updated_expert_map_this_rank, layer_id + self.adaptor.num_dense_layers) - self.weight_update_counter += 1 - if self.weight_update_counter == self.num_moe_layers: - self.weight_update_counter = 0 - self.update_in_flight = False - self.update_info_all = [] - # set asynchronous stream for d2d expert weight update - self.reqs = [] - self.eplb_loader.asyn_expert_weight_transfer(self.reqs) + # set asynchronous stream for d2d expert weight update + self.reqs = [] + self.eplb_loader.asyn_expert_weight_transfer(self.reqs) - def forward_end(self,dummy_run=False): - if not self.update_in_flight: - load_gather_iteration, update_iteration = self.get_update_iteration() - if load_gather_iteration: - moe_load = self.compute_and_set_moe_load() - self.get_expert_load() - if update_iteration: - self.wakeup_eplb_worker() - self.update_in_flight = True - self.wait_worker_iterations = 0 - self.weight_loading = False + def forward_end(self): + if self.wakeup_eplb_worker_flag(): + moe_load = 
self.compute_and_set_moe_load(is_clear=True) + self.wakeup_eplb_worker() - if self.update_in_flight: - self.wait_worker_iterations = self.wait_worker_iterations + 1 + if self.update_expert_weight_flag(): + self.eplb_loader.update_expert_map_and_weight(self.reqs, self.redundant_enable) - self.eplb_loader.update_expert_map_and_weight(self.reqs, self.redundant_enable) + self.update_iteration() - def compute_and_set_moe_load(self,dummy_run=False): + def compute_and_set_moe_load(self, is_clear=False): local_load = self.adaptor.get_rank_expert_workload() self._gather_buffer = None @@ -241,10 +236,10 @@ def get_expert_load(self) -> tuple: return moe_load, expert_maps, num_local_experts def update_expert_load_statistical_period(self, num_expert_load_gather: int, num_iterations: int): - logger.info(f" start update {self.num_expert_load_gather=}, {self.num_iterations}...") + logger.info(f" start update {self.num_expert_load_gather=}, {self.num_iterations_eplb_update}...") self.num_expert_load_gather = num_expert_load_gather - self.num_iterations = num_iterations - logger.info(f" update {self.num_expert_load_gather=}, {self.num_iterations} success...") + self.num_iterations_eplb_update = num_iterations + logger.info(f" update {self.num_expert_load_gather=}, {self.num_iterations_eplb_update} success...") def shutdown(self): """ diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 90ff0d6693..1dc0a0ebd5 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1547,8 +1547,7 @@ def _dummy_run( if is_profile_run and self.dynamic_eplb: self.model.clear_all_moe_loads() if not is_compile and not is_profile_run and self.dynamic_eplb: - dummy_run = True - self.eplb_updator.forward_end(dummy_run) + self.eplb_updator.forward_end() return hidden_states def profile_run(self) -> None: From 3dc10efa477a688e3b9b338505e8803e3880a40a Mon Sep 17 00:00:00 2001 From: "yangcheng (AJ)" Date: Tue, 8 Jul 2025 10:48:13 +0800 Subject: [PATCH 153/153] merge update --- .github/dependabot.yml | 3 - .github/workflows/image_openeuler.yml | 19 +- .github/workflows/image_ubuntu.yml | 11 +- .github/workflows/nightly_benchmarks.yaml | 8 +- .github/workflows/vllm_ascend_doctest.yaml | 21 +- .github/workflows/vllm_ascend_test.yaml | 251 ++---- .../workflows/vllm_ascend_test_long_term.yaml | 23 +- .github/workflows/vllm_ascend_test_pd.yaml | 9 +- README.md | 2 +- README.zh.md | 2 +- docs/requirements-test.txt | 2 +- docs/source/community/contributors.md | 1 - docs/source/community/user_stories/index.md | 19 - .../community/user_stories/llamafactory.md | 19 - docs/source/faqs.md | 5 +- docs/source/index.md | 8 +- docs/source/installation.md | 5 +- docs/source/user_guide/additional_config.md | 1 - docs/source/user_stories/example.md | 15 + docs/source/user_stories/index.md | 22 + examples/disaggregate_prefill_v1/README.md | 234 ++++++ .../disaggregate_prefill_v1/gen_ranktable.py | 120 +++ .../disaggregate_prefill_v1/gen_ranktable.sh | 79 ++ .../disaggregate_prefill_v1/run_server.sh | 32 + .../toy_proxy_server.py | 261 ++++++ examples/dp_offline/data_parallel.py | 255 ++++-- examples/dp_offline/run_dp.sh | 35 +- examples/offline_dualbatch_overlap_npu.py | 2 +- examples/run_dp_server.sh | 37 +- ...6.sh => run_dp_with_cached_graph_etp16.sh} | 34 +- format.sh | 2 +- pyproject.toml | 4 +- requirements-dev.txt | 1 - requirements.txt | 5 +- tests/e2e/common.sh | 25 +- tests/e2e/doctests/001-quickstart-test.sh | 2 + .../002-pip-binary-installation-test.sh | 42 - 
.../e2e/pd_disaggreate/run_edge_case_test.sh | 141 ++++ tests/e2e/pd_disaggreate/test_edge_cases.py | 81 ++ tests/e2e/run_doctests.sh | 6 - .../spec_decode_v0}/__init__.py | 0 .../spec_decode_v0}/conftest.py | 0 .../spec_decode_v0}/e2e/__init__.py | 0 .../spec_decode_v0}/e2e/conftest.py | 0 .../e2e/test_eagle_correctness.py | 344 ++++++++ .../e2e/test_medusa_correctness.py | 4 +- .../e2e/test_mlp_correctness.py | 4 +- .../e2e/test_mtp_correctness.py | 0 .../e2e/test_ngram_correctness.py | 4 +- .../test_dynamic_spec_decode.py | 4 +- .../spec_decode_v0}/test_multi_step_worker.py | 2 +- .../spec_decode_v0}/test_ngram_worker.py | 2 +- .../test_spec_decode_worker.py | 8 +- .../spec_decode_v0}/test_utils.py | 0 .../spec_decode_v0}/utils.py | 0 .../test_v1_mtp_correctness.py | 65 +- .../spec_decode_v1}/test_v1_spec_decode.py | 0 tests/{e2e => }/long_term/test_accuracy.py | 0 .../test_deepseek_v2_lite_tp2_accuracy.py | 2 +- tests/multicard/test_data_parallel.py | 2 +- .../test_dynamic_npugraph_batchsize.py | 0 .../multicard/test_ilama_lora_tp2.py | 4 +- tests/multicard/test_model_qwen3_w4a8.py | 65 ++ .../test_offline_inference_distributed.py | 33 + .../multicard/test_pyhccl_distributed.py | 0 .../multicard/test_torchair_graph_mode.py | 13 +- tests/multicard/test_w4a8_deepseek.py | 67 ++ .../ops/test_vocabparallelembedding.py | 0 tests/{e2e => }/singlecard/__init__.py | 0 .../{e2e => }/singlecard/compile/__init__.py | 0 .../singlecard/compile/test_simple.py | 0 tests/{e2e => }/singlecard/core/__init__.py | 0 .../singlecard/core/test_ascend_scheduler.py | 76 +- .../core/test_ascend_scheduler_e2e.py | 0 tests/{e2e => }/singlecard/ops/__init__.py | 0 .../singlecard/ops/test_fused_moe.py | 0 .../singlecard/ops/test_multi_step.py | 0 .../singlecard/ops/test_rotary_embedding.py | 0 tests/{e2e => }/singlecard/sample/__init__.py | 0 .../sample/test_rejection_sampler.py | 0 tests/{e2e => }/singlecard/test_aclgraph.py | 12 +- tests/singlecard/test_ascend_config.py | 188 +++++ tests/{e2e => }/singlecard/test_camem.py | 0 tests/{e2e => }/singlecard/test_chunked.py | 0 .../singlecard/test_guided_decoding.py | 0 tests/{e2e => }/singlecard/test_ilama_lora.py | 0 .../singlecard/test_offline_inference.py | 4 + .../test_profile_execute_duration.py | 0 .../singlecard/test_prompt_embedding.py | 0 tests/{e2e => }/singlecard/test_pyhccl.py | 0 tests/{e2e => }/singlecard/test_sampler.py | 0 tests/{e2e => }/singlecard/test_scheduler.py | 24 +- tests/ut/fake_weight/config.json | 28 - .../test_llmdatadist_connector.py | 42 + .../test_remote_decode_lifecycle.py | 123 +++ .../test_remote_prefill_lifecycle.py | 242 ++++++ tests/ut/kv_connector/utils.py | 194 +++++ tests/ut/ops/test_expert_load_balancer.py | 147 ---- tests/ut/test_ascend_config.py | 244 ------ tests/ut/worker/test_worker_v1.py | 1 - vllm_ascend/ascend_config.py | 12 +- vllm_ascend/ascend_forward_context.py | 79 ++ vllm_ascend/attention/attention.py | 11 - vllm_ascend/attention/attention_v1.py | 181 +++- vllm_ascend/attention/mla_v1.py | 317 +++---- vllm_ascend/attention/utils.py | 23 + vllm_ascend/compilation/piecewise_backend.py | 52 ++ vllm_ascend/core/scheduler.py | 71 +- vllm_ascend/device_allocator/camem.py | 2 +- vllm_ascend/distributed/__init__.py | 5 + .../llmdatadist_c_mgr_connector.py | 789 ++++++++++++++++++ vllm_ascend/distributed/parallel_state.py | 77 -- vllm_ascend/envs.py | 21 + vllm_ascend/models/__init__.py | 11 +- vllm_ascend/models/deepseek_dbo.py | 625 +++++++------- vllm_ascend/models/deepseek_mtp.py | 23 +- 
vllm_ascend/models/deepseek_v2.py | 56 +- vllm_ascend/multistream/base.py | 2 + vllm_ascend/multistream/metadata.py | 2 + vllm_ascend/multistream/ms_split.py | 14 +- vllm_ascend/ops/attention.py | 34 +- vllm_ascend/ops/fused_moe.py | 140 ++-- vllm_ascend/ops/rotary_embedding.py | 21 +- vllm_ascend/patch/__init__.py | 39 +- .../patch_common/patch_distributed.py | 22 - .../patch_common/patch_multi_step_worker.py | 16 - .../patch_common/patch_spec_decode_worker.py | 12 +- vllm_ascend/platform.py | 24 +- vllm_ascend/quantization/quant_config.py | 40 + vllm_ascend/quantization/quantizer.py | 14 + vllm_ascend/quantization/w4a8_dynamic.py | 377 +++++++++ vllm_ascend/quantization/w8a8.py | 14 +- vllm_ascend/quantization/w8a8_dynamic.py | 289 +++++-- vllm_ascend/soc_info.py | 14 + vllm_ascend/utils.py | 94 ++- vllm_ascend/worker/draft_model_runner.py | 19 +- vllm_ascend/worker/model_runner.py | 10 +- vllm_ascend/worker/model_runner_v1.py | 596 +++++++++---- vllm_ascend/worker/mtp_proposer_v1.py | 195 ++++- vllm_ascend/worker/pooling_model_runner.py | 6 +- vllm_ascend/worker/worker.py | 9 +- vllm_ascend/worker/worker_v1.py | 27 +- 142 files changed, 6026 insertions(+), 2151 deletions(-) delete mode 100644 docs/source/community/user_stories/index.md delete mode 100644 docs/source/community/user_stories/llamafactory.md create mode 100644 docs/source/user_stories/example.md create mode 100644 docs/source/user_stories/index.md create mode 100644 examples/disaggregate_prefill_v1/README.md create mode 100644 examples/disaggregate_prefill_v1/gen_ranktable.py create mode 100644 examples/disaggregate_prefill_v1/gen_ranktable.sh create mode 100644 examples/disaggregate_prefill_v1/run_server.sh create mode 100644 examples/disaggregate_prefill_v1/toy_proxy_server.py rename examples/{run_dp_attention_etp16.sh => run_dp_with_cached_graph_etp16.sh} (53%) delete mode 100644 tests/e2e/doctests/002-pip-binary-installation-test.sh create mode 100644 tests/e2e/pd_disaggreate/run_edge_case_test.sh create mode 100644 tests/e2e/pd_disaggreate/test_edge_cases.py rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/__init__.py (100%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/conftest.py (100%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/e2e/__init__.py (100%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/e2e/conftest.py (100%) create mode 100644 tests/long_term/spec_decode_v0/e2e/test_eagle_correctness.py rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/e2e/test_medusa_correctness.py (99%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/e2e/test_mlp_correctness.py (99%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/e2e/test_mtp_correctness.py (100%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/e2e/test_ngram_correctness.py (99%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/test_dynamic_spec_decode.py (96%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/test_multi_step_worker.py (99%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/test_ngram_worker.py (99%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/test_spec_decode_worker.py (99%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/test_utils.py (100%) rename tests/{e2e/long_term/spec_decode => long_term/spec_decode_v0}/utils.py (100%) rename tests/{e2e/long_term/spec_decode/e2e => 
long_term/spec_decode_v1}/test_v1_mtp_correctness.py (54%) rename tests/{e2e/long_term/spec_decode/e2e => long_term/spec_decode_v1}/test_v1_spec_decode.py (100%) rename tests/{e2e => }/long_term/test_accuracy.py (100%) rename tests/{e2e => }/long_term/test_deepseek_v2_lite_tp2_accuracy.py (97%) rename tests/{e2e => }/multicard/test_dynamic_npugraph_batchsize.py (100%) rename tests/{e2e => }/multicard/test_ilama_lora_tp2.py (83%) create mode 100644 tests/multicard/test_model_qwen3_w4a8.py rename tests/{e2e => }/multicard/test_offline_inference_distributed.py (72%) rename tests/{e2e => }/multicard/test_pyhccl_distributed.py (100%) rename tests/{e2e => }/multicard/test_torchair_graph_mode.py (85%) create mode 100644 tests/multicard/test_w4a8_deepseek.py rename tests/{e2e/singlecard => }/ops/test_vocabparallelembedding.py (100%) rename tests/{e2e => }/singlecard/__init__.py (100%) rename tests/{e2e => }/singlecard/compile/__init__.py (100%) rename tests/{e2e => }/singlecard/compile/test_simple.py (100%) rename tests/{e2e => }/singlecard/core/__init__.py (100%) rename tests/{e2e => }/singlecard/core/test_ascend_scheduler.py (95%) rename tests/{e2e => }/singlecard/core/test_ascend_scheduler_e2e.py (100%) rename tests/{e2e => }/singlecard/ops/__init__.py (100%) rename tests/{e2e => }/singlecard/ops/test_fused_moe.py (100%) rename tests/{e2e => }/singlecard/ops/test_multi_step.py (100%) rename tests/{e2e => }/singlecard/ops/test_rotary_embedding.py (100%) rename tests/{e2e => }/singlecard/sample/__init__.py (100%) rename tests/{e2e => }/singlecard/sample/test_rejection_sampler.py (100%) rename tests/{e2e => }/singlecard/test_aclgraph.py (88%) create mode 100644 tests/singlecard/test_ascend_config.py rename tests/{e2e => }/singlecard/test_camem.py (100%) rename tests/{e2e => }/singlecard/test_chunked.py (100%) rename tests/{e2e => }/singlecard/test_guided_decoding.py (100%) rename tests/{e2e => }/singlecard/test_ilama_lora.py (100%) rename tests/{e2e => }/singlecard/test_offline_inference.py (98%) rename tests/{e2e => }/singlecard/test_profile_execute_duration.py (100%) rename tests/{e2e => }/singlecard/test_prompt_embedding.py (100%) rename tests/{e2e => }/singlecard/test_pyhccl.py (100%) rename tests/{e2e => }/singlecard/test_sampler.py (100%) rename tests/{e2e => }/singlecard/test_scheduler.py (95%) delete mode 100644 tests/ut/fake_weight/config.json create mode 100644 tests/ut/kv_connector/test_llmdatadist_connector.py create mode 100644 tests/ut/kv_connector/test_remote_decode_lifecycle.py create mode 100644 tests/ut/kv_connector/test_remote_prefill_lifecycle.py create mode 100644 tests/ut/kv_connector/utils.py delete mode 100644 tests/ut/ops/test_expert_load_balancer.py delete mode 100644 tests/ut/test_ascend_config.py delete mode 100644 tests/ut/worker/test_worker_v1.py create mode 100644 vllm_ascend/ascend_forward_context.py create mode 100644 vllm_ascend/attention/utils.py create mode 100644 vllm_ascend/distributed/llmdatadist_c_mgr_connector.py delete mode 100644 vllm_ascend/distributed/parallel_state.py create mode 100644 vllm_ascend/quantization/w4a8_dynamic.py create mode 100644 vllm_ascend/soc_info.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 1f4a226933..0eaacea835 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -2,9 +2,6 @@ version: 2 updates: - package-ecosystem: "github-actions" directory: "/" - schedule: - # Check for updates to GitHub Actions every week - interval: "weekly" open-pull-requests-limit: 2 reviewers: - "Yikun" diff --git 
a/.github/workflows/image_openeuler.yml b/.github/workflows/image_openeuler.yml index 690d8145cd..17800206e7 100644 --- a/.github/workflows/image_openeuler.yml +++ b/.github/workflows/image_openeuler.yml @@ -1,4 +1,4 @@ -name: 'image / openEuler' +name: 'image' # This is a docker build check and publish job: # 1. PR Triggered docker image build check # - is for image build check @@ -39,13 +39,9 @@ on: jobs: build: - name: vllm-ascend image build - runs-on: >- - ${{ - github.event_name == 'push' && github.repository_owner == 'vllm-project' && - 'ubuntu-latest' || - 'ubuntu-24.04-arm' - }} + name: vllm-ascend openEuler image + runs-on: ubuntu-latest + steps: - uses: actions/checkout@v4 @@ -97,12 +93,7 @@ jobs: - name: Build and push uses: docker/build-push-action@v6 with: - platforms: >- - ${{ - github.event_name == 'push' && github.repository_owner == 'vllm-project' && - 'linux/amd64,linux/arm64' || - 'linux/arm64' - }} + platforms: linux/amd64,linux/arm64 # use the current repo path as the build context, ensure .git is contained context: . # only trigger when tag, branch/main push diff --git a/.github/workflows/image_ubuntu.yml b/.github/workflows/image_ubuntu.yml index a2cfbcefb0..5fcf32e813 100644 --- a/.github/workflows/image_ubuntu.yml +++ b/.github/workflows/image_ubuntu.yml @@ -1,4 +1,4 @@ -name: 'image / Ubuntu' +name: 'image' # This is a docker build check and publish job: # 1. PR Triggered docker image build check # - is for image build check @@ -39,7 +39,7 @@ on: jobs: build: - name: vllm-ascend image build + name: vllm-ascend Ubuntu image runs-on: ubuntu-latest steps: @@ -93,12 +93,7 @@ jobs: - name: Build and push uses: docker/build-push-action@v6 with: - platforms: >- - ${{ - github.event_name == 'push' && github.repository_owner == 'vllm-project' && - 'linux/amd64,linux/arm64' || - 'linux/amd64' - }} + platforms: linux/amd64,linux/arm64 # use the current repo path as the build context, ensure .git is contained context: . # only trigger when tag, branch/main push diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 2b9c062957..f770d1d992 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -18,11 +18,7 @@ name: 'Benchmarks / Performance' # This workflow runs nightly benchmarks for vllm-ascend. -on: - schedule: - # Run at 02:00 everyday - - cron: '00 18 * * *' - +on: workflow_dispatch: # Allow manual triggering of the workflow @@ -167,8 +163,6 @@ jobs: cp -r benchmarks/* /github/home/benchmarks/ - name: Run benchmark iteration - env: - PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi if: github.event_name != 'pull_request' run: | while IFS= read -r line || [[ -n "$line" ]]; do diff --git a/.github/workflows/vllm_ascend_doctest.yaml b/.github/workflows/vllm_ascend_doctest.yaml index 67f98fbaf7..5f40a2b9e2 100644 --- a/.github/workflows/vllm_ascend_doctest.yaml +++ b/.github/workflows/vllm_ascend_doctest.yaml @@ -29,9 +29,6 @@ on: - 'tests/e2e/doctests/**' - 'tests/e2e/common.sh' - 'tests/e2e/run_doctests.sh' - schedule: - # Runs every 4 hours - - cron: '0 */4 * * *' # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. 
@@ -46,7 +43,7 @@ jobs: # Each version should be tested fail-fast: false matrix: - vllm_verison: [v0.9.1-dev, v0.9.1-dev-openeuler, main, main-openeuler] + vllm_verison: [main, v0.7.3-dev, main-openeuler, v0.7.3-dev-openeuler] name: vLLM Ascend test runs-on: linux-arm64-npu-1 container: @@ -70,13 +67,13 @@ jobs: run: | sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list apt-get update -y - apt install -y gcc g++ libnuma-dev git curl jq + apt install git curl -y - name: Config OS mirrors - openEuler if: ${{ endsWith(matrix.vllm_verison, '-openeuler') }} run: | yum update -y - yum install -y gcc g++ numactl-devel git curl jq + yum install git curl -y - name: Config pip mirrors run: | @@ -88,13 +85,11 @@ jobs: - name: Run vllm-ascend/tests/e2e/run_doctests.sh run: | # PWD: /__w/vllm-ascend/vllm-ascend - echo "Replacing /vllm-workspace/vllm-ascend/tests/e2e ..." - rm -rf /vllm-workspace/vllm-ascend/tests/e2e - mkdir -p /vllm-workspace/vllm-ascend/tests - cp -r tests/e2e /vllm-workspace/vllm-ascend/tests/ - - # TODO(yikun): Remove this after conf.py merged - cp docs/source/conf.py /vllm-workspace/vllm-ascend/docs/source/ + # Address old branch like v0.7.3: + if [ ! -d /vllm-workspace/vllm-ascend/tests/e2e ]; then + echo "Warning: the doctest path doesn't exists, copy now" + cp -r tests/e2e /vllm-workspace/vllm-ascend/tests/ + fi # Simulate container to enter directory cd /workspace diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 540680dd2f..97d316e755 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -18,8 +18,6 @@ name: 'test' on: - schedule: - - cron: '0 23 * * *' pull_request: branches: - 'main' @@ -44,12 +42,6 @@ defaults: run: shell: bash -el {0} -# only cancel in-progress runs of the same workflow -# and ignore the lint / 1 card / 4 cards test type -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - jobs: lint: runs-on: ubuntu-latest @@ -114,171 +106,32 @@ jobs: echo "::add-matcher::.github/workflows/matchers/mypy.json" tools/mypy.sh 1 ${{ matrix.python-version }} - ut: - needs: [lint] - name: unit test - if: ${{ needs.lint.result == 'success' }} - runs-on: ubuntu-latest - container: - image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 - env: - VLLM_LOGGING_LEVEL: ERROR - VLLM_USE_MODELSCOPE: True - strategy: - matrix: - vllm_version: [main, v0.9.1] - steps: - - name: Install packages - run: | - apt-get update -y - apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev - - - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v4 - with: - repository: vllm-project/vllm - ref: ${{ matrix.vllm_version }} - path: ./vllm-empty - - - name: Install vllm-project/vllm from source - working-directory: ./vllm-empty - run: | - VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/ - python3 -m pip uninstall -y triton - - - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v4 - - - name: Install vllm-project/vllm-ascend - run: | - export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib - python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/ - python3 -m pip install -v . 
--extra-index https://download.pytorch.org/whl/cpu/ - - - name: Run unit test for V1 Engine - env: - VLLM_USE_V1: 1 - VLLM_WORKER_MULTIPROC_METHOD: spawn - TORCH_DEVICE_BACKEND_AUTOLOAD: 0 - run: | - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib - pytest -sv tests/ut - e2e: needs: [lint] if: ${{ needs.lint.result == 'success' }} strategy: max-parallel: 2 matrix: - os: [linux-arm64-npu-1] - vllm_version: [main, v0.9.1] - name: singlecard e2e test - runs-on: ${{ matrix.os }} - container: - # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready - image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 - env: - VLLM_LOGGING_LEVEL: ERROR - steps: - - name: Check npu and CANN info - run: | - npu-smi info - cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info - - - name: Config mirrors - run: | - sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - apt-get update -y - apt install git -y - git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ - - - name: Checkout vllm-project/vllm-ascend repo - uses: actions/checkout@v4 - - - name: Install system dependencies - run: | - apt-get -y install `cat packages.txt` - apt-get -y install gcc g++ cmake libnuma-dev - - - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v4 - with: - repository: vllm-project/vllm - ref: ${{ matrix.vllm_version }} - path: ./vllm-empty - - - name: Install vllm-project/vllm from source - working-directory: ./vllm-empty - run: | - VLLM_TARGET_DEVICE=empty pip install -e . - - - name: Install vllm-project/vllm-ascend - env: - PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi - run: | - pip install -r requirements-dev.txt - pip install -v -e . 
- - - name: Run e2e test for V1 Engine - env: - VLLM_USE_V1: 1 - VLLM_WORKER_MULTIPROC_METHOD: spawn - VLLM_USE_MODELSCOPE: True - run: | - pytest -sv tests/e2e/singlecard/test_offline_inference.py - # TODO: switch hf to modelscope - VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/e2e/singlecard/test_ilama_lora.py - # TODO(sss): guided decoding doesn't work, fix it later - # pytest -sv tests/e2e/singlecard/test_guided_decoding.py - pytest -sv tests/e2e/singlecard/test_camem.py - pytest -sv tests/e2e/singlecard/ \ - --ignore=tests/e2e/singlecard/test_offline_inference.py \ - --ignore=tests/e2e/singlecard/test_ilama_lora.py \ - --ignore=tests/e2e/singlecard/test_guided_decoding.py \ - --ignore=tests/e2e/singlecard/test_camem.py - - - name: Run e2e test on V0 engine - if: ${{ github.event_name == 'schedule' }} - env: - VLLM_USE_V1: 0 - VLLM_USE_MODELSCOPE: True - run: | - pytest -sv tests/e2e/singlecard/test_offline_inference.py - # TODO: switch hf to modelscope - VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/e2e/singlecard/test_ilama_lora.py - # guided decoding doesn't work, fix it later - # pytest -sv tests/e2e/singlecard/test_guided_decoding.py - pytest -sv tests/e2e/singlecard/test_camem.py - pytest -sv tests/e2e/singlecard/test_prompt_embedding.py - pytest -sv tests/e2e/singlecard/ \ - --ignore=tests/e2e/singlecard/test_offline_inference.py \ - --ignore=tests/e2e/singlecard/test_ilama_lora.py \ - --ignore=tests/e2e/singlecard/test_guided_decoding.py \ - --ignore=tests/e2e/singlecard/test_camem.py \ - --ignore=tests/e2e/singlecard/test_prompt_embedding.py \ - --ignore=tests/e2e/singlecard/core/test_ascend_scheduler.py \ - --ignore=tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py - - e2e-4-cards: - needs: [e2e] - if: ${{ needs.e2e.result == 'success' }} - strategy: - max-parallel: 1 - matrix: - os: [linux-arm64-npu-4] - vllm_version: [main, v0.9.1] - name: multicard e2e test + os: [linux-arm64-npu-1, linux-arm64-npu-4] + vllm_version: [v0.9.1] + concurrency: + group: > + ${{ + matrix.os == 'linux-arm64-npu-4' + && github.event.pull_request.number + && format('pr-{0}-limit-npu-4', github.event.pull_request.number) + || format('job-{0}-{1}-{2}', matrix.os, matrix.vllm_version, github.event.pull_request.number) + }} + cancel-in-progress: false + name: vLLM Ascend test runs-on: ${{ matrix.os }} container: # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 env: + HF_ENDPOINT: https://hf-mirror.com + HF_TOKEN: ${{ secrets.HF_TOKEN }} VLLM_LOGGING_LEVEL: ERROR - VLLM_USE_MODELSCOPE: True steps: - name: Check npu and CANN info run: | @@ -324,32 +177,64 @@ jobs: env: VLLM_USE_V1: 1 VLLM_WORKER_MULTIPROC_METHOD: spawn - VLLM_USE_MODELSCOPE: True run: | - # TODO: switch hf to modelscope - VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py - # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error. - # To avoid oom, we need to run the test in a single process. 
- pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py + if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then + VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py + # guided decoding doesn't work, fix it later + # pytest -sv tests/singlecard/test_guided_decoding.py.py + # test_ascend_config.py should be ran separately because it will regenerate the global config many times. + pytest -sv tests/singlecard/test_ascend_config.py + pytest -sv tests/singlecard/test_camem.py + pytest -sv tests/singlecard/core/test_ascend_scheduler.py + pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py + pytest -sv tests/singlecard/ \ + --ignore=tests/singlecard/test_offline_inference.py \ + --ignore=tests/singlecard/test_guided_decoding.py \ + --ignore=tests/singlecard/test_ascend_config.py \ + --ignore=tests/singlecard/test_camem.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py + else + pytest -sv tests/multicard/test_ilama_lora_tp2.py + # To avoid oom, we need to run the test in a single process. + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_w4a8_deepseek.py::test_deepseek_W4A8 + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py --ignore=tests/multicard/test_w4a8_deepseek.py + fi - name: Run vllm-project/vllm-ascend test on V0 engine if: ${{ github.event_name == 'schedule' }} env: VLLM_USE_V1: 0 - VLLM_USE_MODELSCOPE: True run: | - # TODO: switch hf to modelscope - VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \ - pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py - # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error. - # To avoid oom, we need to run the test in a single process. 
- pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk - pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 - pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py --ignore=tests/e2e/multicard/test_offline_inference_distributed.py + if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then + VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py + # guided decoding doesn't work, fix it later + # pytest -sv tests/singlecard/test_guided_decoding.py.py + pytest -sv tests/singlecard/test_camem.py + # test_ascend_config.py should be ran separately because it will regenerate the global config many times. + pytest -sv tests/singlecard/test_ascend_config.py + pytest -sv tests/singlecard/test_prompt_embedding.py + pytest -sv tests/singlecard/ \ + --ignore=tests/singlecard/test_offline_inference.py \ + --ignore=tests/singlecard/test_guided_decoding.py \ + --ignore=tests/singlecard/test_camem.py \ + --ignore=tests/singlecard/test_ascend_config.py \ + --ignore=tests/singlecard/test_prompt_embedding.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py + else + pytest -sv tests/multicard/test_ilama_lora_tp2.py + # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. + # To avoid oom, we need to run the test in a single process. + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py + fi diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index e249849e19..1b07fbc3be 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -17,9 +17,6 @@ name: 'e2e test / long-term-test' on: - schedule: - # Runs at 23:00 UTC (7:00 AM Beijing) every day - - cron: '0 23 * * *' pull_request: types: [ labeled ] @@ -43,7 +40,7 @@ jobs: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [main, v0.9.1] + vllm_version: [v0.9.1] name: vLLM Ascend long term test runs-on: ${{ matrix.os }} container: @@ -97,13 +94,17 @@ jobs: - name: Run vllm-project/vllm-ascend long term test run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then - # spec decode test - VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py + # v0 spec decode test + # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process + # pytest -sv tests/long_term/spec_decode_v0 
--ignore=tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py + # v1 spec decode test + # TODO: revert me when test_v1_mtp_correctness.py is fixed + VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed - # VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py - VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process - pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py - pytest -sv tests/e2e/long_term/test_accuracy.py + # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_spec_decode.py + # accuracy test single card + pytest -sv tests/long_term/test_accuracy.py else - VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py + # accuracy test multi card + VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py fi diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml index 932b3e59b3..015c88c170 100644 --- a/.github/workflows/vllm_ascend_test_pd.yaml +++ b/.github/workflows/vllm_ascend_test_pd.yaml @@ -17,9 +17,6 @@ name: 'e2e test / pd-disaggregation' on: - schedule: - # Runs at 23:00 UTC (7:00 AM Beijing) every day - - cron: '0 23 * * *' pull_request: types: [ labeled ] @@ -41,7 +38,7 @@ jobs: if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }} strategy: matrix: - vllm_verison: [main, v0.9.1] + vllm_verison: [v0.9.1] name: vLLM Ascend prefilling decoding disaggregation test runs-on: linux-arm64-npu-static-8 @@ -106,3 +103,7 @@ jobs: - name: Run vllm-project/vllm-ascend PD Disaggregation test run: | pytest -sv tests/e2e/pd_disaggreate/test_pd_e2e.py + + - name: Run vllm-project/vllm-ascend PD Disaggregation edge test + run: | + bash tests/e2e/pd_disaggreate/run_edge_case_test.sh \ No newline at end of file diff --git a/README.md b/README.md index 7d0966c8d4..bdc6a1bb00 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l - Software: * Python >= 3.9, < 3.12 * CANN >= 8.1.RC1 - * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250528 + * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250619 * vLLM (the same version as vllm-ascend) ## Getting Started diff --git a/README.zh.md b/README.zh.md index 2d2062a8b4..afe2c76143 100644 --- a/README.zh.md +++ b/README.zh.md @@ -39,7 +39,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP - 软件: * Python >= 3.9, < 3.12 * CANN >= 8.1.RC1 - * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250528 + * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250619 * vLLM (与vllm-ascend版本一致) ## 开始使用 diff --git a/docs/requirements-test.txt b/docs/requirements-test.txt index c63a6da9cb..b8c6fe990f 100644 --- a/docs/requirements-test.txt +++ b/docs/requirements-test.txt @@ -1,2 +1,2 @@ pytest-asyncio -pytest-mock + diff --git a/docs/source/community/contributors.md b/docs/source/community/contributors.md index 16ea600d65..d61259de84 100644 --- a/docs/source/community/contributors.md +++ 
b/docs/source/community/contributors.md @@ -7,7 +7,6 @@ | Xiyuan Wang| [@wangxiyuan](https://github.com/wangxiyuan) | 2025/01 | | Yikun Jiang| [@Yikun](https://github.com/Yikun) | 2025/02 | | Yi Gan| [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/02 | -| Shoujian Zheng| [@jianzs](https://github.com/jianzs) | 2025/06 | ## Contributors diff --git a/docs/source/community/user_stories/index.md b/docs/source/community/user_stories/index.md deleted file mode 100644 index 1dc1e56e76..0000000000 --- a/docs/source/community/user_stories/index.md +++ /dev/null @@ -1,19 +0,0 @@ -# User Stories - -Read case studies on how users and developers solves real, everyday problems with vLLM Ascend - -- [LLaMA-Factory](./llamafactory.md) is an easy-to-use and efficient platform for training and fine-tuning large language models, it supports vLLM Ascend to speed up inference since [LLaMA-Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739), gain 2x performance enhancement of inference. - -- [Huggingface/trl](https://github.com/huggingface/trl) is a cutting-edge library designed for post-training foundation models using advanced techniques like SFT, PPO and DPO, it uses vLLM Ascend since [v0.17.0](https://github.com/huggingface/trl/releases/tag/v0.17.0) to support RLHF on Ascend NPU. - -- [MindIE Turbo](https://pypi.org/project/mindie-turbo) is an LLM inference engine acceleration plug-in library developed by Huawei on Ascend hardware, which includes self-developed large language model optimization algorithms and optimizations related to the inference engine framework. It supports vLLM Ascend since [2.0rc1](https://www.hiascend.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev/mindie-turbo-0001.html). - -- [GPUStack](https://github.com/gpustack/gpustack) is an open-source GPU cluster manager for running AI models. It supports vLLM Ascend since [v0.6.2](https://github.com/gpustack/gpustack/releases/tag/v0.6.2), see more GPUStack performance evaluation info on [link](https://mp.weixin.qq.com/s/pkytJVjcH9_OnffnsFGaew). - -- [verl](https://github.com/volcengine/verl) is a flexible, efficient and production-ready RL training library for large language models (LLMs), uses vLLM Ascend since [v0.4.0](https://github.com/volcengine/verl/releases/tag/v0.4.0), see more info on [verl x Ascend Quickstart](https://verl.readthedocs.io/en/latest/ascend_tutorial/ascend_quick_start.html). - -:::{toctree} -:caption: More details -:maxdepth: 1 -llamafactory -::: diff --git a/docs/source/community/user_stories/llamafactory.md b/docs/source/community/user_stories/llamafactory.md deleted file mode 100644 index bb95990eb0..0000000000 --- a/docs/source/community/user_stories/llamafactory.md +++ /dev/null @@ -1,19 +0,0 @@ -# LLaMA-Factory - -**About / Introduction** - -[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) is an easy-to-use and efficient platform for training and fine-tuning large language models. With LLaMA-Factory, you can fine-tune hundreds of pre-trained models locally without writing any code. - -LLaMA-Facotory users need to evaluate and inference the model after fine-tuning the model. - -**The Business Challenge** - -LLaMA-Factory used transformers to perform inference on Ascend NPU, but the speed was slow. 
- -**Solving Challenges and Benefits with vLLM Ascend** - -With the joint efforts of LLaMA-Factory and vLLM Ascend ([LLaMA-Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739)), the performance of LLaMA-Factory in the model inference stage has been significantly improved. According to the test results, the inference speed of LLaMA-Factory has been increased to 2x compared to the transformers version. - -**Learn more** - -See more about LLaMA-Factory and how it uses vLLM Ascend for inference on the Ascend NPU in the following documentation: [LLaMA-Factory Ascend NPU Inference](https://llamafactory.readthedocs.io/en/latest/advanced/npu_inference.html). diff --git a/docs/source/faqs.md b/docs/source/faqs.md index 1de3befb2d..be6d689eff 100644 --- a/docs/source/faqs.md +++ b/docs/source/faqs.md @@ -114,7 +114,7 @@ In scenarios where NPUs have limited HBM (High Bandwidth Memory) capacity, dynam - **Configure `PYTORCH_NPU_ALLOC_CONF`**: Set this environment variable to optimize NPU memory management. For example, you can `export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` to enable virtual memory feature to mitigate memory fragmentation caused by frequent dynamic memory size adjustments during runtime, see more note in: [PYTORCH_NPU_ALLOC_CONF](https://www.hiascend.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html). -### 15. Failed to enable NPU graph mode when running DeepSeek? +### 16. Failed to enable NPU graph mode when running DeepSeek? You may encounter the following error if running DeepSeek with NPU graph mode enabled. The allowed number of queries per kv when enabling both MLA and Graph mode only support {32, 64, 128}, **Thus this is not supported for DeepSeek-V2-Lite**, as it only has 16 attention heads. The NPU graph mode support on DeepSeek-V2-Lite will be done in the future. And if you're using DeepSeek-V3 or DeepSeek-R1, please make sure after the tensor parallel split, num_heads / num_kv_heads in {32, 64, 128}. @@ -123,3 +123,6 @@ And if you're using DeepSeek-V3 or DeepSeek-R1, please make sure after the tenso [rank0]: RuntimeError: EZ9999: Inner Error! [rank0]: EZ9999: [PID: 62938] 2025-05-27-06:52:12.455.807 numHeads / numKvHeads = 8, MLA only support {32, 64, 128}.[FUNC:CheckMlaAttrs][FILE:incre_flash_attention_tiling_check.cc][LINE:1218] ``` + +### 17. Failed to reinstall vllm-ascend from source after uninstalling vllm-ascend? +You may encounter the problem of C compilation failure when reinstalling vllm-ascend from source using pip. If the installation fails, it is recommended to use `python setup.py install` to install, or use `python setup.py clean` to clear the cache. diff --git a/docs/source/index.md b/docs/source/index.md index e9b103b2c3..5f421b4679 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -66,5 +66,11 @@ developer_guide/evaluation/index :maxdepth: 1 community/governance community/contributors -community/user_stories/index +::: + +% User stories about vLLM Ascend project +:::{toctree} +:caption: User Story +:maxdepth: 1 +user_stories/index ::: diff --git a/docs/source/installation.md b/docs/source/installation.md index c290f7e5f7..b7eb611a03 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -12,7 +12,7 @@ This document describes how to install vllm-ascend manually. 
| Software | Supported version | Note | |---------------|----------------------------------|-------------------------------------------| | CANN | >= 8.1.RC1 | Required for vllm-ascend and torch-npu | - | torch-npu | >= 2.5.1.post1.dev20250528 | Required for vllm-ascend | + | torch-npu | >= 2.5.1.post1.dev20250619 | Required for vllm-ascend | | torch | >= 2.5.1 | Required for torch-npu and vllm | You have 2 way to install: @@ -246,8 +246,7 @@ for output in outputs: Then run: ```bash -# Try `export VLLM_USE_MODELSCOPE=true` and `pip install modelscope` -# to speed up download if huggingface is not reachable. +# export VLLM_USE_MODELSCOPE=true to speed up download if huggingface is not reachable. python example.py ``` diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index d4756ef5e1..2a0194209b 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -28,7 +28,6 @@ The following table lists the additional configuration options available in vLLM |-------------------------------| ---- |------|-----------------------------------------------------------------------------------------------| | `torchair_graph_config` | dict | `{}` | The config options for torchair graph mode | | `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler | -| `expert_tensor_parallel_size` | str | `0` | Expert tensor parallel size the model to use. | | `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf case. | | `expert_map_path` | str | None | When using expert load balancing for the MOE model, an expert map path needs to be passed in. | | `chunked_prefill_for_mla` | bool | `False` | Whether to enable the fused operator-like chunked_prefill. | diff --git a/docs/source/user_stories/example.md b/docs/source/user_stories/example.md new file mode 100644 index 0000000000..ebbcf56d7b --- /dev/null +++ b/docs/source/user_stories/example.md @@ -0,0 +1,15 @@ +# xxx project uses Ascend vLLM, gain 200% performance enhancement of inference. + +## About / Introduction +Draft content + +## The Business Challenge +Our goal is to ... + +## Solving challenges with vLLM Ascend +vLLM Ascend helped us ... + +## Benefits using vLLM Ascend + +## Learn more +more info about this case diff --git a/docs/source/user_stories/index.md b/docs/source/user_stories/index.md new file mode 100644 index 0000000000..6de8b21577 --- /dev/null +++ b/docs/source/user_stories/index.md @@ -0,0 +1,22 @@ +# vLLM Ascend User Stories + +Read case studies on how users and developers solves real, everyday problems with vLLM Ascend + +:::{card} Example user story +:link: ./example +:link-type: doc + +xxx project uses Ascend vLLM, gain 200% performance enhancement of inference. + ++++ + +**Tags**: vLLM, Ascend, Inference + +::: + +:::{toctree} +:caption: Deployment +:maxdepth: 1 +:hidden: +example +::: diff --git a/examples/disaggregate_prefill_v1/README.md b/examples/disaggregate_prefill_v1/README.md new file mode 100644 index 0000000000..544d5ba020 --- /dev/null +++ b/examples/disaggregate_prefill_v1/README.md @@ -0,0 +1,234 @@ +# Disaggregated Prefill-Decode Deployment Guide + +## Overview +This demo document provides instructions for running a disaggregated vLLM-ascend service with separate prefill and decode stages across 4 nodes, uses 16 Ascend NPUs for two prefill nodes (P1/P2) and 16 Ascend NPUS for two decode nodes (D1/D2). 
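For reference, the NPU accounting behind this layout, as configured in the serve commands below: each role (prefill and decode) runs with `--data-parallel-size 2`, `--data-parallel-size-local 1` (one DP rank per node) and `--tensor-parallel-size 8`, so every role spans 2 nodes × 8 NPUs = 16 NPUs. This matches the `--prefill-device-cnt 16` and `--decode-device-cnt 16` arguments passed to `gen_ranktable.sh` in the rank table step.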
+ +## Prerequisites +- Ascend NPU environment with vLLM 0.9.1 installed +- Network interfaces configured for distributed communication (eg: eth0) +- Model weights located at `/data01/deepseek_r1_w8a8_zhw` + +## Rank table generation +The rank table is a JSON file that specifies the mapping of Ascend NPU ranks to nodes. The following command generates a rank table for all nodes with 16 cards prefill and 16 cards decode: + +Run the following command on every node to generate the rank table: +```shell +cd vllm-ascend/examples/disaggregate_prefill_v1/ +bash gen_ranktable.sh --ips 172.19.32.175 172.19.241.49 172.19.123.51 172.19.190.36 \ + --npus-per-node 8 --network-card-name enp189s0f0 --prefill-device-cnt 16 --decode-device-cnt 16 +``` +Rank table will generated at `/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json` + +## Start disaggregated vLLM-ascend service +Execution Sequence +- 4 configured node ip are: 172.19.32.175 172.19.241.49 172.19.123.51 172.19.190.36 +- Start Prefill on Node 1 (P1) +- Start Prefill on Node 2 (P2) +- Start Decode on Node 1 (D1) +- Start Decode on Node 2 (D2) +- Start proxy server on Node1 + +* Run prefill server P1 on first node +```shell +export HCCL_IF_IP=172.19.32.175 # node ip +export GLOO_SOCKET_IFNAME="eth0" # network card name +export TP_SOCKET_IFNAME="eth0" +export HCCL_SOCKET_IFNAME="eth0" +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 +export VLLM_USE_V1=1 +vllm serve /data01/deepseek_r1_w8a8_zhw \ + --host 0.0.0.0 \ + --port 20002 \ + --data-parallel-size 2 \ + --data-parallel-size-local 1 \ + --api-server-count 2 \ + --data-parallel-address 172.19.32.175 \ + --data-parallel-rpc-port 13356 \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --no-enable-prefix-caching \ + --seed 1024 \ + --served-model-name deepseek \ + --max-model-len 6144 \ + --max-num-batched-tokens 6144 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_producer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' \ + --additional-config \ + '{"torchair_graph_config": {"enabled":false, "enable_multistream_shared_expert":false}, "ascend_scheduler_config":{"enabled":true, "enable_chunked_prefill":false}}' +``` + +* Run prefill server P2 on second node +```shell +export HCCL_IF_IP=172.19.241.49 +export GLOO_SOCKET_IFNAME="eth0" +export TP_SOCKET_IFNAME="eth0" +export HCCL_SOCKET_IFNAME="eth0" +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 +export VLLM_USE_V1=1 +vllm serve /data01/deepseek_r1_w8a8_zhw \ + --host 0.0.0.0 \ + --port 20002 \ + --headless \ + --data-parallel-size 2 \ + --data-parallel-start-rank 1 \ + --data-parallel-size-local 1 \ + --data-parallel-address 172.19.32.175 \ + --data-parallel-rpc-port 13356 \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --no-enable-prefix-caching \ + --seed 1024 \ + --served-model-name deepseek \ + --max-model-len 6144 \ + --max-num-batched-tokens 6144 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector": 
"LLMDataDistCMgrConnector", \ + "kv_buffer_device": "npu", + "kv_role": "kv_producer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' \ + --additional-config \ + '{"torchair_graph_config": {"enabled":false, "enable_multistream_shared_expert":false}, "ascend_scheduler_config":{"enabled":true, "enable_chunked_prefill":false}}' +``` + +* Run decode server d1 on third node +```shell +export HCCL_IF_IP=172.19.123.51 +export GLOO_SOCKET_IFNAME="eth0" +export TP_SOCKET_IFNAME="eth0" +export HCCL_SOCKET_IFNAME="eth0" +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 +export VLLM_USE_V1=1 +vllm serve /data01/deepseek_r1_w8a8_zhw \ + --host 0.0.0.0 \ + --port 20002 \ + --data-parallel-size 2 \ + --data-parallel-size-local 1 \ + --api-server-count 2 \ + --data-parallel-address 172.19.123.51 \ + --data-parallel-rpc-port 13356 \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --no-enable-prefix-caching \ + --seed 1024 \ + --served-model-name deepseek \ + --max-model-len 6144 \ + --max-num-batched-tokens 6144 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_consumer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' \ + --additional-config \ + '{"torchair_graph_config": {"enabled":false, "enable_multistream_shared_expert":false}, "ascend_scheduler_config":{"enabled":true, "enable_chunked_prefill":false}}' +``` + +* Run decode server d2 on last node +```shell +export HCCL_IF_IP=172.19.190.36 +export GLOO_SOCKET_IFNAME="eth0" +export TP_SOCKET_IFNAME="eth0" +export HCCL_SOCKET_IFNAME="eth0" +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 +export VLLM_USE_V1=1 +vllm serve /data01/deepseek_r1_w8a8_zhw \ + --host 0.0.0.0 \ + --port 20002 \ + --headless \ + --data-parallel-size 2 \ + --data-parallel-start-rank 1 \ + --data-parallel-size-local 1 \ + --data-parallel-address 172.19.123.51 \ + --data-parallel-rpc-port 13356 \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --no-enable-prefix-caching \ + --seed 1024 \ + --served-model-name deepseek \ + --max-model-len 6144 \ + --max-num-batched-tokens 6144 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_consumer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' \ + --additional-config \ + '{"torchair_graph_config": {"enabled":false, "enable_multistream_shared_expert":false}, "ascend_scheduler_config":{"enabled":true, "enable_chunked_prefill":false}}' +``` + +* Run proxy server on the first node +```shell +cd /vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1 +python toy_proxy_server.py --host 172.19.32.175 --port 1025 --prefiller-hosts 172.19.241.49 --prefiller-port 20002 --decoder-hosts 172.19.123.51 --decoder-ports 20002 +``` + +* Verification +Check service health 
using the proxy server endpoint: +```shell +curl http://localhost:1025/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek", + "prompt": "Who are you?", + "max_tokens": 100, + "temperature": 0 + }' +``` + +* Performance +Test performance with vllm benchmark +```shell +cd /vllm-workspace/vllm/benchmarks +python3 benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --random-input-len 4096 \ + --random-output-len 1536 \ + --num-prompts 256 \ + --ignore-eos \ + --model deepseek \ + --tokenizer /data01/deepseek_r1_w8a8_zhw \ + --host localhost \ + --port 8000 \ + --endpoint /v1/completions \ + --max-concurrency 4 \ + --request-rate 4 +``` \ No newline at end of file diff --git a/examples/disaggregate_prefill_v1/gen_ranktable.py b/examples/disaggregate_prefill_v1/gen_ranktable.py new file mode 100644 index 0000000000..d170f3ba06 --- /dev/null +++ b/examples/disaggregate_prefill_v1/gen_ranktable.py @@ -0,0 +1,120 @@ +import argparse +import json +import os + +import torch.distributed as dist + +from vllm_ascend.soc_info import NPUSocInfo + +parser = argparse.ArgumentParser( + description="Arguments of rank table generator", ) +parser.add_argument("--local-host", type=str, required=True, help="local ip") +parser.add_argument("--prefill-device-cnt", + type=int, + required=True, + help="number of prefill devices") +parser.add_argument("--decode-device-cnt", + type=int, + required=True, + help="number of decode devices") +args = parser.parse_args() +local_host = args.local_host +prefill_device_cnt = args.prefill_device_cnt +decode_device_cnt = args.decode_device_cnt + +print("enter py") + +hccn_tool_path = os.environ.get("HCCN_TOOL_PATH", + "/usr/local/Ascend/driver/tools/hccn_tool") +master_addr = os.environ.get("MASTER_ADDR") +master_port = os.environ.get("MASTER_PORT") +rank = os.environ.get("RANK") +local_rank = os.environ.get("LOCAL_RANK") +# This variable is set by torchrun, +# and is different from WORLD_SIZE in gen_rank_table.sh. 
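+# gen_ranktable.sh launches this script through torchrun with --nproc_per_node 1,
+# so torchrun's WORLD_SIZE equals the number of nodes and is written to the rank
+# table below as "server_count".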
+world_size = os.environ.get("WORLD_SIZE") +soc_info = NPUSocInfo() + + +def get_cmd_stdout(cmd): + import subprocess + return subprocess.run(cmd, capture_output=True, + shell=True).stdout.decode("utf-8").strip() + + +print(f"local_host: {local_host}") +print("gen ranktable.json") + +num_cards = get_cmd_stdout("npu-smi info -l | grep \"Total Count\"").split( + ":")[1].strip() +num_cards = int(num_cards) +chips_per_card = get_cmd_stdout("npu-smi info -l | grep \"Chip Count\"").split( + "\n")[0].split(":")[1].strip() +chips_per_card = int(chips_per_card) + +# generate local device list for local rank 0, and gather it to all ranks +local_device_list: list[dict[str, str]] = list() +if local_rank == "0": + super_pod_id = "0" + for card_id in range(num_cards): + for chip_id in range(chips_per_card): + device_id = card_id * chips_per_card + chip_id + if soc_info.is_a3: + device_ip = get_cmd_stdout( + f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr" + ).split(":")[1].strip() + super_device_id = get_cmd_stdout( + f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID" + ).split(":")[1].strip() + super_pod_id = get_cmd_stdout( + f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\"" + ).split(":")[1].strip() + else: + device_ip = get_cmd_stdout( + f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr" + ).split(":")[1].strip() + + device_info = { + "server_id": local_host, + "device_id": str(device_id), + "device_ip": str(device_ip), + } + if soc_info.is_a3: + device_info.update({ + "super_pod_id": str(super_pod_id), + "super_device_id": str(super_device_id) + }) + local_device_list.append(device_info) + +dist.init_process_group(backend=dist.Backend.GLOO) +global_device_list = [None] * dist.get_world_size() +dist.all_gather_object(global_device_list, local_device_list) +global_device_list = [ + device_info for device_list in global_device_list + for device_info in device_list # type: ignore[attr-defined] +] +cnt = 1 +for device_info in global_device_list: # type: ignore[assignment] + device_info["cluster_id"] = str(cnt) + cnt += 1 +assert (prefill_device_cnt + decode_device_cnt) <= len(global_device_list), \ +"prefill_device_cnt + decode_device_cnt must be less than or equal to number of all devices in cluster" +ranktable = { + "version": + "1.2", + "server_count": + str(world_size), + "prefill_device_list": + global_device_list[:prefill_device_cnt], + "decode_device_list": + global_device_list[prefill_device_cnt:prefill_device_cnt + + decode_device_cnt], + "status": + "completed" +} + +if local_rank == '0': + with open("ranktable.json", "w") as f: + json.dump(ranktable, f, indent=4) + + print("gen ranktable.json done") diff --git a/examples/disaggregate_prefill_v1/gen_ranktable.sh b/examples/disaggregate_prefill_v1/gen_ranktable.sh new file mode 100644 index 0000000000..33d4a32e8d --- /dev/null +++ b/examples/disaggregate_prefill_v1/gen_ranktable.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +source /usr/local/Ascend/ascend-toolkit/set_env.sh +export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH} + +NPUS_PER_NODE=8 +while [[ $# -gt 0 ]]; do + case "$1" in + --ips) + shift + while [[ $# -gt 0 && ! 
"$1" == --* ]]; do + IPs+=("$1") + shift + done + ;; + --npus-per-node) + shift + NPUS_PER_NODE="$1" + shift + ;; + --network-card-name) + shift + NETWORK_CARD_NAME="$1" + shift + ;; + --prefill-device-cnt) + shift + PREFILL_DEVICE_CNT="$1" + shift + ;; + --decode-device-cnt) + shift + DECODE_DEVICE_CNT="$1" + shift + ;; + esac +done +LOCAL_HOSTS=($(hostname -I)) +LOCAL_HOST="127.0.0.1" +MASTER_ADDR=${IPs[0]} +MASTER_PORT=6657 +NNODES=${#IPs[@]} +NODE_RANK="8" +for i in "${!IPs[@]}"; do + ip="${IPs[$i]}" + for local_host in "${LOCAL_HOSTS[@]}"; do + if [[ "$local_host" == "$ip" ]]; then + LOCAL_HOST=$local_host + NODE_RANK=$i + break 2 + fi + done +done + +if [[ $NODE_RANK == "" ]];then + echo "[Error] para \"NODE_RANK\" must be defined" + exit 1 +fi + +WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES)) +RANKSTART=`expr $NPUS_PER_NODE \* $NODE_RANK` + +echo "========>param:" +echo "LOCAL_HOST": $LOCAL_HOST +echo "WORLD_SIZE: " $WORLD_SIZE +echo "RANKSTART": $RANKSTART +echo "NNODES": $NNODES +echo "NODE_RANK": $NODE_RANK +echo "===============" + +if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then + GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \ + --nproc_per_node 1 \ + --nnodes ${NNODES} \ + --node_rank ${NODE_RANK} \ + --master_addr ${MASTER_ADDR} \ + --master_port ${MASTER_PORT} \ + gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT +fi \ No newline at end of file diff --git a/examples/disaggregate_prefill_v1/run_server.sh b/examples/disaggregate_prefill_v1/run_server.sh new file mode 100644 index 0000000000..37cf6d3aee --- /dev/null +++ b/examples/disaggregate_prefill_v1/run_server.sh @@ -0,0 +1,32 @@ +export HCCL_IF_IP=141.61.39.117 +export GLOO_SOCKET_IFNAME="enp48s3u1u1" +export TP_SOCKET_IFNAME="enp48s3u1u1" +export HCCL_SOCKET_IFNAME="enp48s3u1u1" +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=path-to-rank-table + +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 + +export VLLM_USE_V1=1 + +vllm serve model_path \ + --host 0.0.0.0 \ + --port 20002 \ + --tensor-parallel-size 1\ + --seed 1024 \ + --served-model-name dsv3 \ + --max-model-len 2000 \ + ---max-num-batched-tokens 2000 \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_consumer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": 0, + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_connector_v1_a3" + }' \ + --additional-config \ + '{"enable_graph_mode": "True"}'\ diff --git a/examples/disaggregate_prefill_v1/toy_proxy_server.py b/examples/disaggregate_prefill_v1/toy_proxy_server.py new file mode 100644 index 0000000000..4478073f74 --- /dev/null +++ b/examples/disaggregate_prefill_v1/toy_proxy_server.py @@ -0,0 +1,261 @@ +# Adapted from https://github.com/vllm-project/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py + +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import itertools +import os +import uuid +from contextlib import asynccontextmanager + +import httpx +from fastapi import FastAPI, Request +from fastapi.responses import StreamingResponse +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """ + Lifespan context manager to handle startup and shutdown events. 
+ """ + # Startup: Initialize client pools for prefiller and decoder services + app.state.prefill_clients = [] + app.state.decode_clients = [] + + # Create prefill clients + for i, (host, port) in enumerate(global_args.prefiller_instances): + prefiller_base_url = f'http://{host}:{port}/v1' + app.state.prefill_clients.append({ + 'client': + httpx.AsyncClient(timeout=None, base_url=prefiller_base_url), + 'host': + host, + 'port': + port, + 'id': + i + }) + + # Create decode clients + for i, (host, port) in enumerate(global_args.decoder_instances): + decoder_base_url = f'http://{host}:{port}/v1' + app.state.decode_clients.append({ + 'client': + httpx.AsyncClient(timeout=None, base_url=decoder_base_url), + 'host': + host, + 'port': + port, + 'id': + i + }) + + # Initialize round-robin iterators + app.state.prefill_iterator = itertools.cycle( + range(len(app.state.prefill_clients))) + app.state.decode_iterator = itertools.cycle( + range(len(app.state.decode_clients))) + + print(f"Initialized {len(app.state.prefill_clients)} prefill clients " + f"and {len(app.state.decode_clients)} decode clients.") + + yield + + # Shutdown: Close all clients + for client_info in app.state.prefill_clients: + await client_info['client'].aclose() + + for client_info in app.state.decode_clients: + await client_info['client'].aclose() + + +# Update FastAPI app initialization to use lifespan +app = FastAPI(lifespan=lifespan) + + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--host", type=str, default="localhost") + + # For prefiller instances + parser.add_argument("--prefiller-hosts", + "--prefiller-host", + type=str, + nargs="+", + default=["localhost"]) + parser.add_argument("--prefiller-ports", + "--prefiller-port", + type=int, + nargs="+", + default=[8100]) + + # For decoder instances + parser.add_argument("--decoder-hosts", + "--decoder-host", + type=str, + nargs="+", + default=["localhost"]) + parser.add_argument("--decoder-ports", + "--decoder-port", + type=int, + nargs="+", + default=[8200]) + + args = parser.parse_args() + + # Validate and pair hosts with ports + if len(args.prefiller_hosts) != len(args.prefiller_ports): + raise ValueError( + "Number of prefiller hosts must match number of prefiller ports") + + if len(args.decoder_hosts) != len(args.decoder_ports): + raise ValueError( + "Number of decoder hosts must match number of decoder ports") + + # Create tuples of (host, port) for each service type + args.prefiller_instances = list( + zip(args.prefiller_hosts, args.prefiller_ports)) + args.decoder_instances = list(zip(args.decoder_hosts, args.decoder_ports)) + + return args + + +def get_next_client(app, service_type: str): + """ + Get the next client in round-robin fashion. + + Args: + app: The FastAPI app instance + service_type: Either 'prefill' or 'decode' + + Returns: + The next client to use + """ + if service_type == 'prefill': + client_idx = next(app.state.prefill_iterator) + return app.state.prefill_clients[client_idx] + elif service_type == 'decode': + client_idx = next(app.state.decode_iterator) + return app.state.decode_clients[client_idx] + else: + raise ValueError(f"Unknown service type: {service_type}") + + +async def send_request_to_service(client_info: dict, endpoint: str, + req_data: dict, request_id: str): + """ + Send a request to a service using a client from the pool. 
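+
+    The request is rewritten before it is sent: streaming is disabled,
+    max_tokens is forced to 1, and kv_transfer_params is attached so that
+    the prefiller only produces the KV cache to be reused by the decoder.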
+ """ + req_data = req_data.copy() + req_data['kv_transfer_params'] = { + "do_remote_decode": True, + "do_remote_prefill": False, + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": None, + "remote_port": None + } + req_data["stream"] = False + req_data["max_tokens"] = 1 + if "stream_options" in req_data: + del req_data["stream_options"] + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id + } + + response = await client_info['client'].post(endpoint, + json=req_data, + headers=headers) + response.raise_for_status() + + return response + + +async def stream_service_response(client_info: dict, endpoint: str, + req_data: dict, request_id: str): + """ + Asynchronously stream response from a service using a client from the pool. + """ + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id + } + + async with client_info['client'].stream("POST", + endpoint, + json=req_data, + headers=headers) as response: + response.raise_for_status() + async for chunk in response.aiter_bytes(): + yield chunk + + +@app.post("/v1/completions") +async def handle_completions(request: Request): + try: + req_data = await request.json() + request_id = str(uuid.uuid4()) + + # Get the next prefill client in round-robin fashion + prefill_client_info = get_next_client(request.app, 'prefill') + + # Send request to prefill service + response = await send_request_to_service(prefill_client_info, + "/completions", req_data, + request_id) + + # Extract the needed fields + response_json = response.json() + kv_transfer_params = response_json.get('kv_transfer_params', {}) + if kv_transfer_params: + req_data["kv_transfer_params"] = kv_transfer_params + + # Get the next decode client in round-robin fashion + decode_client_info = get_next_client(request.app, 'decode') + + logger.debug("Using %s %s", prefill_client_info, decode_client_info) + + # Stream response from decode service + async def generate_stream(): + async for chunk in stream_service_response(decode_client_info, + "/completions", + req_data, + request_id=request_id): + yield chunk + + return StreamingResponse(generate_stream(), + media_type="application/json") + + except Exception as e: + import sys + import traceback + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server" + " - completions endpoint") + print(e) + print("".join(traceback.format_exception(*exc_info))) + raise + + +@app.get("/healthcheck") +async def healthcheck(): + """Simple endpoint to check if the server is running.""" + return { + "status": "ok", + "prefill_instances": len(app.state.prefill_clients), + "decode_instances": len(app.state.decode_clients) + } + + +if __name__ == '__main__': + global global_args + global_args = parse_args() + + import uvicorn + uvicorn.run(app, host=global_args.host, port=global_args.port) diff --git a/examples/dp_offline/data_parallel.py b/examples/dp_offline/data_parallel.py index b06c52d8c5..37a14d5f7b 100644 --- a/examples/dp_offline/data_parallel.py +++ b/examples/dp_offline/data_parallel.py @@ -1,85 +1,226 @@ -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# This file is a part of the vllm-ascend project. -# Adapted from vllm-project/vllm/examples/offline_inference/data_parallel.py # SPDX-License-Identifier: Apache-2.0 -# usage: -# python examples/offline_inference_data_parallel.py -# we need to have a launcher to create multiple data parallel -# ranks. 
And each rank will create a vLLM instance to process its own prompts. +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Usage: +Single node: + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 + +Multi-node: + Node 0 (assume the node has ip of 10.99.48.128): + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=0 \ + --master-addr=10.99.48.128 \ + --master-port=13345 + Node 1: + python examples/offline_inference/data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=1 \ + --master-addr=10.99.48.128 \ + --master-port=13345 +""" -import gc import os +from time import sleep + +from vllm import LLM, SamplingParams +from vllm.utils import get_open_port + + +def parse_args(): + import argparse + parser = argparse.ArgumentParser(description="Data Parallel Inference") + parser.add_argument( + "--model", + type=str, + default="ibm-research/PowerMoE-3b", + help="Model name or path", + ) + parser.add_argument("--dp-size", + type=int, + default=2, + help="Data parallel size") + parser.add_argument("--tp-size", + type=int, + default=2, + help="Tensor parallel size") + parser.add_argument("--node-size", + type=int, + default=1, + help="Total number of nodes") + parser.add_argument("--node-rank", + type=int, + default=0, + help="Rank of the current node") + parser.add_argument("--master-addr", + type=str, + default="", + help="Master node IP address") + parser.add_argument("--master-port", + type=int, + default=0, + help="Master node port") + parser.add_argument("--enforce-eager", + action="store_true", + help="Enforce eager mode execution.") + parser.add_argument("--trust-remote-code", + action="store_true", + help="Trust remote code.") + return parser.parse_args() -def main(): - dp_rank = int(os.environ['RANK']) - local_rank = int(os.environ['LOCAL_RANK']) - dp_size = int(os.environ['WORLD_SIZE']) - master_addr = os.environ['MASTER_ADDR'] - master_port = os.environ['MASTER_PORT'] - tp_size = 1 - etp_size = 1 - os.environ["VLLM_DP_RANK"] = str(dp_rank) +def main( + model, + dp_size, + local_dp_rank, + global_dp_rank, + dp_master_ip, + dp_master_port, + GPUs_per_dp_rank, + enforce_eager, + trust_remote_code, +): + os.environ["VLLM_DP_RANK"] = str(global_dp_rank) + os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) os.environ["VLLM_DP_SIZE"] = str(dp_size) - os.environ["VLLM_DP_MASTER_IP"] = master_addr - os.environ["VLLM_DP_MASTER_PORT"] = master_port - os.environ["ASCEND_RT_VISIBLE_DEVICES"] = ",".join( - str(i) - for i in range(local_rank * tp_size, (local_rank + 1) * tp_size)) + os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip + os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port) - import torch - from vllm import LLM, SamplingParams - from vllm.distributed.parallel_state import ( - destroy_distributed_environment, destroy_model_parallel) + # CUDA_VISIBLE_DEVICES for each DP rank is set automatically inside the + # engine processes. + # Sample prompts. prompts = [ "Hello, my name is", "The president of the United States is", "The capital of France is", "The future of AI is", - ] * 4 + ] * 100 - promts_per_rank = len(prompts) // dp_size - start = dp_rank * promts_per_rank - end = start + promts_per_rank - prompts = prompts[start:end] + # with DP, each rank should process different prompts. 
+ # usually all the DP ranks process a full dataset, + # and each rank processes a different part of the dataset. + floor = len(prompts) // dp_size + remainder = len(prompts) % dp_size + + # Distribute prompts into even groups. + def start(rank): + return rank * floor + min(rank, remainder) + + prompts = prompts[start(global_dp_rank):start(global_dp_rank + 1)] if len(prompts) == 0: + # if any rank has no prompts to process, + # we need to set a placeholder prompt prompts = ["Placeholder"] - print(f"DP rank {dp_rank} needs to process {len(prompts)} prompts") - num_seqs = len(prompts) + print(f"DP rank {global_dp_rank} needs to process {len(prompts)} prompts") + + # Create a sampling params object. + # since we are doing data parallel, every rank can have different + # sampling params. here we set different max_tokens for different + # ranks for demonstration. + sampling_params = SamplingParams( + temperature=0.0, + max_tokens=32, + ) - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - max_tokens=4, - min_tokens=4) # Create an LLM. - llm = LLM(model="deepseek-ai/DeepSeek-V2-Lite-Chat", - tensor_parallel_size=tp_size, - trust_remote_code=True, - max_model_len=4096, - max_num_seqs=num_seqs, - additional_config={ - 'expert_tensor_parallel_size': etp_size, - 'torchair_graph_config': { - 'enabled': False, - }, - }) + llm = LLM( + model=model, + tensor_parallel_size=GPUs_per_dp_rank, + enforce_eager=enforce_eager, + trust_remote_code=trust_remote_code, + distributed_executor_backend="mp", + max_model_len=2048, + max_num_batched_tokens=2048, + max_num_seqs=16, + enable_prefix_caching=False, + enable_expert_parallel=True, + gpu_memory_utilization=0.9, + additional_config={ + "ascend_scheduler_config": { + "enabled": True + }, + "torchair_graph_config": { + "enabled": False, + "enable_multistream_shared_expert": False + }, + }, + ) outputs = llm.generate(prompts, sampling_params) - for output in outputs: + # Print the outputs. + for i, output in enumerate(outputs): + if i >= 5: + # print only 5 outputs + break prompt = output.prompt generated_text = output.outputs[0].text - print(f"DP rank {dp_rank}, Prompt: {prompt!r}, " + print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " f"Generated text: {generated_text!r}") - del llm - destroy_model_parallel() - destroy_distributed_environment() - gc.collect() - torch.npu.empty_cache() + # Give engines time to pause their processing loops before exiting. + sleep(1) if __name__ == "__main__": - main() + args = parse_args() + + dp_size = args.dp_size + tp_size = args.tp_size + node_size = args.node_size + node_rank = args.node_rank + + if node_size == 1: + dp_master_ip = "127.0.0.1" + dp_master_port = get_open_port() + else: + dp_master_ip = args.master_addr + dp_master_port = args.master_port + + assert dp_size % node_size == 0, "dp_size should be divisible by node_size" + dp_per_node = dp_size // node_size + + from multiprocessing import Process + + procs = [] + for local_dp_rank, global_dp_rank in enumerate( + range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)): + proc = Process( + target=main, + args=( + args.model, + dp_size, + local_dp_rank, + global_dp_rank, + dp_master_ip, + dp_master_port, + tp_size, + args.enforce_eager, + args.trust_remote_code, + ), + ) + proc.start() + procs.append(proc) + exit_code = 0 + for proc in procs: + proc.join(timeout=3000) + if proc.exitcode is None: + print( + f"Killing process {proc.pid} that didn't stop within 5 minutes." 
+ ) + proc.kill() + exit_code = 1 + elif proc.exitcode: + exit_code = proc.exitcode + + exit(exit_code) diff --git a/examples/dp_offline/run_dp.sh b/examples/dp_offline/run_dp.sh index 405df604a4..508d966651 100644 --- a/examples/dp_offline/run_dp.sh +++ b/examples/dp_offline/run_dp.sh @@ -1,19 +1,28 @@ +rm -rf ./.torchair_cache/ +rm -rf ./dynamo_* +rm -rf /root/ascend/log/debug/plog/* + +ifname="ifname" +local_ip="local ip" +master_addr="master ip" +model_path="path to model ckpt" + export HCCL_IF_IP=${local_ip} export GLOO_SOCKET_IFNAME=${ifname} export TP_SOCKET_IFNAME=${ifname} export HCCL_SOCKET_IFNAME=${ifname} -# dp_size = node_size * dp_per_node -node_size=1 -node_rank=0 -dp_per_node=4 -master_addr=127.0.0.1 -master_port=12345 - -rm -rf ./.torchair_cache/ -rm -rf ./dynamo_* -rm -rf /root/ascend/log/debug/plog/* +export VLLM_USE_V1=1 +export ASCEND_LAUNCH_BLOCKING=0 +# export VLLM_VERSION=0.9.0 -torchrun --nproc_per_node ${dp_per_node} --nnodes ${node_size} \ - --node_rank ${node_rank} --master_addr ${master_addr} --master_port ${master_port} \ - data_parallel.py +python data_parallel.py \ + --model=${model_path} \ + --dp-size=4 \ + --tp-size=4 \ + --enforce-eager \ + --trust-remote-code \ + --node-size=1 \ + --node-rank=0 \ + --master-addr=${master_addr} \ + --master-port=13345 diff --git a/examples/offline_dualbatch_overlap_npu.py b/examples/offline_dualbatch_overlap_npu.py index d8153e38ca..dd8ee9aeb1 100644 --- a/examples/offline_dualbatch_overlap_npu.py +++ b/examples/offline_dualbatch_overlap_npu.py @@ -20,6 +20,7 @@ def main(): tensor_parallel_size=2, max_model_len=4096, trust_remote_code=True, + enable_expert_parallel=True, additional_config={ "torchair_graph_config": { "enabled": False @@ -27,7 +28,6 @@ def main(): "ascend_scheduler_config": { "enabled": True }, - "expert_tensor_parallel_size": 1 }) # Generate texts from the prompts. 
The output is a list of RequestOutput diff --git a/examples/run_dp_server.sh b/examples/run_dp_server.sh index e2bf4c8158..eb3cfbf510 100644 --- a/examples/run_dp_server.sh +++ b/examples/run_dp_server.sh @@ -1,3 +1,7 @@ +rm -rf ./.torchair_cache/ +rm -rf ./dynamo_* +rm -rf /root/ascend/log/debug/plog/* + export HCCL_IF_IP=2.0.0.0 export GLOO_SOCKET_IFNAME="enp189s0f0" export TP_SOCKET_IFNAME="enp189s0f0" @@ -6,25 +10,24 @@ export HCCL_SOCKET_IFNAME="enp189s0f0" export OMP_PROC_BIND=false export OMP_NUM_THREADS=100 -export VLLM_USE_V1=0 - -export ASCEND_RT_VISIBLE_DEVICES=0,1 -export VLLM_DP_SIZE=2 -export VLLM_DP_RANK=0 -export VLLM_DP_MASTER_IP="2.0.0.0" -export VLLM_DP_MASTER_PORT=40001 -export VLLM_DP_PROXY_IP="2.0.0.0" -export VLLM_DP_PROXY_PORT=30002 -export VLLM_DP_MONITOR_PORT=30003 -export VLLM_HTTP_PORT=20001 +export VLLM_USE_V1=1 +export ASCEND_LAUNCH_BLOCKING=0 vllm serve /data/weights/Qwen2.5-0.5B-Instruct \ --host 0.0.0.0 \ - --port 20001 \ - --tensor-parallel-size 1 \ - --seed 1024 \ + --port 20002 \ --served-model-name Qwen \ - --max-model-len 2000 \ - --max-num-batched-tokens 2000 \ + --data-parallel-size 4 \ + --data-parallel-size-local 4 \ + --data-parallel-address 2.0.0.0 \ + --data-parallel-rpc-port 13389 \ + --tensor-parallel-size 4 \ + --enable-expert-parallel \ + --no-enable-prefix-caching \ + --max-num-seqs 16 \ + --max-model-len 4096 \ + --max-num-batched-tokens 4096 \ + --gpu-memory-utilization 0.9 \ --trust-remote-code \ - --gpu-memory-utilization 0.9 \ \ No newline at end of file + --enforce-eager \ + --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "enable_multistream_moe":false, "use_cached_graph":false}}' diff --git a/examples/run_dp_attention_etp16.sh b/examples/run_dp_with_cached_graph_etp16.sh similarity index 53% rename from examples/run_dp_attention_etp16.sh rename to examples/run_dp_with_cached_graph_etp16.sh index b736492411..5f1d3b782b 100644 --- a/examples/run_dp_attention_etp16.sh +++ b/examples/run_dp_with_cached_graph_etp16.sh @@ -1,23 +1,25 @@ -export VLLM_ENABLE_MC2=0 +export HCCL_IF_IP=2.0.0.0 +export GLOO_SOCKET_IFNAME="enp189s0f0" +export TP_SOCKET_IFNAME="enp189s0f0" +export HCCL_SOCKET_IFNAME="enp189s0f0" + export VLLM_USE_V1=1 -export TASK_QUEUE_ENABLE=1 -source /usr/local/Ascend/ascend-toolkit/set_env.sh -source /usr/local/Ascend/nnal/atb/set_env.sh export ASCEND_LAUNCH_BLOCKING=0 -export VLLM_VERSION=0.9.0 +# export VLLM_VERSION=0.9.0 nohup python -m vllm.entrypoints.openai.api_server --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM \ + --host 0.0.0.0 \ + --port 20002 \ --quantization ascend \ - --trust-remote-code \ - --distributed-executor-backend=mp \ - --port 8006 \ - -tp=8 \ -dp=2 \ - --max-num-seqs 24 \ - --max-model-len 32768 \ - --max-num-batched-tokens 32768 \ - --block-size 128 \ + -tp=8 \ --no-enable-prefix-caching \ - --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true},"expert_tensor_parallel_size":16}' \ - --gpu-memory-utilization 0.96 &> run.log & -disown \ No newline at end of file + --max-num-seqs 24 \ + --max-model-len 4096 \ + --max-num-batched-tokens 4096 \ + --gpu-memory-utilization 0.96 \ + --trust-remote-code \ + --distributed-executor-backend=mp \ + --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true}}' \ + & > run.log & +disown diff --git a/format.sh 
b/format.sh index e49ac915b9..d8a04069a8 100755 --- a/format.sh +++ b/format.sh @@ -273,7 +273,7 @@ echo 'vllm-ascend isort: Done' # Clang-format section # Exclude some files for formatting because they are vendored CLANG_FORMAT_EXCLUDES=( - 'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/kernels/get_masked_input_and_mask_kernel.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h' + 'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h' ) # Format specified files with clang-format diff --git a/pyproject.toml b/pyproject.toml index 514b755c32..3b03a886e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,12 +12,14 @@ requires = [ "scipy", "setuptools>=64", "setuptools-scm>=8", - "torch-npu==2.5.1.post1.dev20250528", + "torch-npu==2.5.1", "torch>=2.5.1", "torchvision<0.21.0", "wheel", "msgpack", "quart", "numba", + # Remove after https://github.com/vllm-project/vllm-ascend/issues/1470 + "transformers<4.53.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements-dev.txt b/requirements-dev.txt index c6f9241f99..133d4604c2 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,7 +4,6 @@ modelscope openai pytest >= 6.0 pytest-asyncio -pytest-mock lm-eval ray types-jsonschema diff --git a/requirements.txt b/requirements.txt index eadb96f1e9..6d84ec658c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,7 @@ numba # Install torch_npu --pre --extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi -torch-npu==2.5.1.post1.dev20250528 +torch-npu==2.5.1.post1.dev20250619 + +# Remove after https://github.com/vllm-project/vllm-ascend/issues/1470 +transformers<4.53.0 diff --git a/tests/e2e/common.sh b/tests/e2e/common.sh index 3c61524925..a8b8810108 100644 --- a/tests/e2e/common.sh +++ b/tests/e2e/common.sh @@ -45,30 +45,7 @@ function wait_for_exit() { _info "===> Wait for ${VLLM_PID} to exit." sleep 1 done - _info "===> Process ${VLLM_PID} has exited." -} - -VENV_PATH=/tmp/vllm_venv - -function clean_venv() { - if [[ -n "$VENV_PATH" && -d "$VENV_PATH" ]]; then - _info "Cleaning up default virtual env path: ${VENV_PATH}" - deactivate || true - rm -rf "$VENV_PATH" - fi -} - -function create_vllm_venv() { - # make a clean env path - clean_venv - _info "Creating vllm virtual environment at ${VENV_PATH}" - python3 -m venv ${VENV_PATH} - source ${VENV_PATH}/bin/activate -} - -function get_version() { - local VERSION_NAME="$1" - python3 "${SCRIPT_DIR}/../../docs/source/conf.py" | jq .${VERSION_NAME} | tr -d '"' + _info "===> Wait for ${VLLM_PID} to exit." 
} SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) diff --git a/tests/e2e/doctests/001-quickstart-test.sh b/tests/e2e/doctests/001-quickstart-test.sh index 44dce0d7bc..0e59255cb9 100755 --- a/tests/e2e/doctests/001-quickstart-test.sh +++ b/tests/e2e/doctests/001-quickstart-test.sh @@ -23,11 +23,13 @@ function simple_test() { } function quickstart_offline_test() { + export VLLM_USE_MODELSCOPE=true # Do real script test python3 "${SCRIPT_DIR}/../../examples/offline_inference_npu.py" } function quickstart_online_test() { + export VLLM_USE_MODELSCOPE=true vllm serve Qwen/Qwen2.5-0.5B-Instruct & wait_url_ready "vllm serve" "localhost:8000/v1/models" # Do real curl test diff --git a/tests/e2e/doctests/002-pip-binary-installation-test.sh b/tests/e2e/doctests/002-pip-binary-installation-test.sh deleted file mode 100644 index 48a33bf6ff..0000000000 --- a/tests/e2e/doctests/002-pip-binary-installation-test.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -# -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# -trap clean_venv EXIT - -function install_binary_test() { - - create_vllm_venv - - PIP_VLLM_VERSION=$(get_version pip_vllm_version) - PIP_VLLM_ASCEND_VERSION=$(get_version pip_vllm_ascend_version) - _info "====> Install vllm==${PIP_VLLM_VERSION} and vllm-ascend ${PIP_VLLM_ASCEND_VERSION}" - - pip install vllm=="$(get_version pip_vllm_version)" - pip install vllm-ascend=="$(get_version pip_vllm_ascend_version)" - - pip list | grep vllm - - # Verify the installation - _info "====> Run offline example test" - pip install modelscope - python3 "${SCRIPT_DIR}/../../examples/offline_inference_npu.py" - -} - -_info "====> Start install_binary_test" -install_binary_test diff --git a/tests/e2e/pd_disaggreate/run_edge_case_test.sh b/tests/e2e/pd_disaggreate/run_edge_case_test.sh new file mode 100644 index 0000000000..2f5b1bc8e9 --- /dev/null +++ b/tests/e2e/pd_disaggreate/run_edge_case_test.sh @@ -0,0 +1,141 @@ +#!/bin/bash +export LCCL_DETERMINISTIC=1 +export HCCL_DETERMINISTIC=true +export CLOSE_MATMUL_K_SHIFT=1 +export VLLM_USE_V1=1 + +set -xe + +# Models to run +MODELS=( + "Qwen/Qwen2.5-0.5B-Instruct" +) + +# Find the git repository root directory +GIT_ROOT=$(git rev-parse --show-toplevel) + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT + +# Gen ranktable +RANKTABLE_PATH=${GIT_ROOT}/examples/disaggregate_prefill_v1/ranktable.json +if [ -f "$RANKTABLE_PATH" ]; then + rm "$RANKTABLE_PATH" +fi +cd ${GIT_ROOT}/examples/disaggregate_prefill_v1 +LOCAL_HOST=`hostname -I|awk -F " " '{print$1}'` +bash gen_ranktable.sh --ips $LOCAL_HOST --network-card-name enp189s0f0 --prefill-device-cnt 1 --decode-device-cnt 1 +cd - +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="$RANKTABLE_PATH" + +# Waits for vLLM to start. 
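+# Polls the /health endpoint of the given port once per second and gives up
+# after 20 minutes.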
+wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/health > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +# Function to clean up previous instances +cleanup_instances() { + echo "Cleaning up any running vLLM instances..." + pkill -f "vllm serve" || true + sleep 2 +} + +# Handle to get model-specific arguments for deepseek +get_model_args() { + local model_name=$1 + local extra_args="" + + if [[ "$model_name" == *"deepseek"* ]]; then + extra_args="--trust-remote-code" + fi + + echo "$extra_args" +} + + +# Function to run tests for a specific model +run_tests_for_model() { + local model_name=$1 + echo "================================" + echo "Testing model: $model_name" + echo "================================" + + # Get model-specific arguments + local model_args=$(get_model_args "$model_name") + + # Start prefill instance + PREFILL_PORT=8001 + + BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_LLMDD_RPC_PORT=5559 vllm serve $model_name \ + --port $PREFILL_PORT \ + --seed 1024 \ + --enforce-eager \ + --disable-log-requests \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Start decode instance + DECODE_PORT=8002 + + # Build the command with or without model-specific args + BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_LLMDD_RPC_PORT=6000 vllm serve $model_name \ + --port $DECODE_PORT \ + --seed 1024 \ + --enforce-eager \ + --disable-log-requests \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Wait for all instances to start + echo "Waiting for prefill instance on port $PORT to start..." + wait_for_server $PREFILL_PORT + echo "Waiting for decode instance on port $PORT to start..." + wait_for_server $DECODE_PORT + + # Build the command for the proxy server with all the hosts and ports + PROXY_PORT=8192 + PROXY_CMD="python ${GIT_ROOT}/examples/disaggregate_prefill_v1/toy_proxy_server.py --port $PROXY_PORT" + PROXY_CMD+=" --prefiller-ports ${PREFILL_PORT}" + PROXY_CMD+=" --decoder-ports ${DECODE_PORT}" + # Start the proxy server + echo "Starting proxy server with command: $PROXY_CMD" + $PROXY_CMD & + + # Wait for the proxy to start + sleep 5 + + # Run lm eval for this model + echo "Running tests for $model_name" + PREFILL_PORT=$PREFILL_PORT DECODE_PORT=$DECODE_PORT PROXY_PORT=$PROXY_PORT python -m pytest -s -v ${GIT_ROOT}/tests/e2e/pd_disaggreate/test_edge_cases.py + + # Clean up before running next model + cleanup_instances + sleep 3 +} + +# Run tests for each model +for model in "${MODELS[@]}"; do + run_tests_for_model "$model" +done + +echo "All tests completed!" 
\ No newline at end of file diff --git a/tests/e2e/pd_disaggreate/test_edge_cases.py b/tests/e2e/pd_disaggreate/test_edge_cases.py new file mode 100644 index 0000000000..fe53ddc6db --- /dev/null +++ b/tests/e2e/pd_disaggreate/test_edge_cases.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# This code is from: https://github.com/vllm-project/vllm/blob/main/tests/v1/kv_connector/nixl_integration/test_edge_cases.py +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +import os + +import openai + +PREFILL_PORT = os.getenv("PREFILL_PORT", None) +DECODE_PORT = os.getenv("DECODE_PORT", None) +PROXY_PORT = os.getenv("PROXY_PORT", None) + +if PREFILL_PORT is None or DECODE_PORT is None or PROXY_PORT is None: + raise ValueError( + "Please set the PREFILL_PORT, DECODE_PORT, and PROXY_PORT.") + +LONG_PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result, when working on projects like vLLM we are able to meet many amazing people from various organizations like AMD, Google, NVIDIA, " # noqa: E501 +PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result," # noqa: E501 +SHORT_PROMPT = "Red Hat is " + + +def test_edge_cases(): + # Set the OpenAI API key and base URL + decode_client = openai.OpenAI( + api_key="MY_KEY", + base_url=f"http://localhost:{DECODE_PORT}/v1", + ) + prefill_client = openai.OpenAI( + api_key="MY_KEY", + base_url=f"http://localhost:{PREFILL_PORT}/v1", + ) + proxy_client = openai.OpenAI( + api_key="MY_KEY", + base_url=f"http://localhost:{PROXY_PORT}/v1", + ) + + # Get the list of models + models = decode_client.models.list() + MODEL = models.data[0].id + + # (1) Check that we can handle a very short prompt, + # less than the length of the block size. + completion = proxy_client.completions.create(model=MODEL, + prompt=SHORT_PROMPT, + temperature=0) + proxy_response = completion.choices[0].text + completion = prefill_client.completions.create(model=MODEL, + prompt=SHORT_PROMPT, + temperature=0) + prefill_response = completion.choices[0].text + print(f"SMALL PROMPT: {proxy_response=}") + print(f"SMALL PROMPT: {prefill_response=}") + assert proxy_response == prefill_response + + # (2) Check that we can handle a full prefix cache + # hit on the D worker but not on the P worker. + # (2a): prime the D worker. + completion = decode_client.completions.create(model=MODEL, + prompt=PROMPT, + temperature=0) + decode_response = completion.choices[0].text + # (2b): send via the P/D setup + completion = proxy_client.completions.create(model=MODEL, + prompt=PROMPT, + temperature=0) + proxy_response = completion.choices[0].text + print(f"FULL CACHE HIT: {proxy_response=}") + assert proxy_response == decode_response + + # (3) Check that we can handle a partial prefix cache + # hit on the D worker. 
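+    # LONG_PROMPT extends PROMPT, which was already primed on the D worker in
+    # step (2a), so only a prefix of this request is cached on D.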
+ completion = proxy_client.completions.create(model=MODEL, + prompt=LONG_PROMPT, + temperature=0) + proxy_response = completion.choices[0].text + completion = prefill_client.completions.create(model=MODEL, + prompt=LONG_PROMPT, + temperature=0) + prefill_response = completion.choices[0].text + print(f"PARTIAL CACHE HIT: {proxy_response=}") + assert proxy_response == prefill_response \ No newline at end of file diff --git a/tests/e2e/run_doctests.sh b/tests/e2e/run_doctests.sh index 2b00b6466c..ee6613ad5d 100755 --- a/tests/e2e/run_doctests.sh +++ b/tests/e2e/run_doctests.sh @@ -21,13 +21,7 @@ set -eo errexit . $(dirname "$0")/common.sh -export VLLM_USE_MODELSCOPE=true -export VLLM_LOGGING_LEVEL=ERROR - _info "====> Start Quickstart test" . "${SCRIPT_DIR}/doctests/001-quickstart-test.sh" -_info "====> Start pip binary installation test" -. "${SCRIPT_DIR}/doctests/002-pip-binary-installation-test.sh" - _info "Doctest passed." diff --git a/tests/e2e/long_term/spec_decode/__init__.py b/tests/long_term/spec_decode_v0/__init__.py similarity index 100% rename from tests/e2e/long_term/spec_decode/__init__.py rename to tests/long_term/spec_decode_v0/__init__.py diff --git a/tests/e2e/long_term/spec_decode/conftest.py b/tests/long_term/spec_decode_v0/conftest.py similarity index 100% rename from tests/e2e/long_term/spec_decode/conftest.py rename to tests/long_term/spec_decode_v0/conftest.py diff --git a/tests/e2e/long_term/spec_decode/e2e/__init__.py b/tests/long_term/spec_decode_v0/e2e/__init__.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/__init__.py rename to tests/long_term/spec_decode_v0/e2e/__init__.py diff --git a/tests/e2e/long_term/spec_decode/e2e/conftest.py b/tests/long_term/spec_decode_v0/e2e/conftest.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/conftest.py rename to tests/long_term/spec_decode_v0/e2e/conftest.py diff --git a/tests/long_term/spec_decode_v0/e2e/test_eagle_correctness.py b/tests/long_term/spec_decode_v0/e2e/test_eagle_correctness.py new file mode 100644 index 0000000000..88ee6bb3d2 --- /dev/null +++ b/tests/long_term/spec_decode_v0/e2e/test_eagle_correctness.py @@ -0,0 +1,344 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/tests/spec_decode/e2e/test_eagle_correctness.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""This docstring details important information on the testing methodology. + +Most of the tests rely on "greedy equality", where we expect the output of +speculative decoding on a sequence to exactly match the output of normal non- +speculative decoding. + +Since speculative decoding with rejection sampling guarantees that the output +distribution matches the target model's output distribution (up to hardware +numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy +equality. 
+ +However, we still need to verify below scenario could be passed: + * Batch size 1 greedy equality + * Batch size >1 greedy equality + * Test greedy equality under preemption + * Test greedy equality under various number of speculative tokens. + +With those tests, we can say at least, EAGLE would not break the +correctness for the target model outputs. +""" + +import pytest + +from tests.long_term.spec_decode_v0.e2e.conftest import \ + run_equality_correctness_test + +# main model +MAIN_MODEL = "JackFram/llama-68m" + +# speculative model +SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random" + +# max. number of speculative tokens: this corresponds to +# num_heads in the config.json of the speculator model. +MAX_SPEC_TOKENS = 4 + +# precision +# TODO The vLLM here uses float32, but some op on the vllm-ascend +# do not support float32, such as ROPE, When it is fixed, it is +# recommended to change this to float32. +PRECISION = "float16" + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Print spec metrics. + "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + }, + }, +]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size: int, output_len: int, + seed: int): + + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Print spec metrics. 
+ "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + "disable_logprobs": False, + }, +}, { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + "disable_logprobs": True, + }, +}]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("logprobs", [1, 6]) +def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size: int, output_len: int, seed: int, + logprobs: int): + + run_equality_correctness_test( + vllm_runner, + common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, + test_llm_kwargs, + batch_size, + output_len, + seed, + logprobs=logprobs, + prompt_logprobs=logprobs, + disable_logprobs=test_llm_kwargs["speculative_config"] + ["disable_logprobs"]) + + +@pytest.mark.skipif(True, reason="Open it when graph mode ready.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "enforce_eager": False, + + # Print spec metrics. + "disable_log_stats": False, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + }, + }, +]) +@pytest.mark.parametrize("output_len", [ + 128, +]) +@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_e2e_greedy_correctness_cuda_graph( + vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify greedy equality with cuda graph enabled and different + batch sizes.""" + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.skipif(True, reason="Open it when preempt ready.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + "block_size": 8, + # 2 for small prompt, 256//8 for generated. + "num_gpu_blocks_override": 2 + 256 // 8, + "max_model_len": (2 + 256 // 8) * 8, + + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + }, + }, +]) +@pytest.mark.parametrize( + "output_len", + [ + # Use small output len for fast test. 
+ 128, + ]) +@pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_e2e_greedy_correctness_with_preemption( + vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify greedy equality, even when some sequences are preempted mid- + generation. + """ + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize( + "test_llm_kwargs", + [ + { + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": k, + }, + } + # Try a range of num. speculative tokens + for k in range(1, 1 + MAX_SPEC_TOKENS) + ]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_different_k(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, baseline_llm_kwargs, + test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify that eagle speculative decoding produces exact equality + to without spec decode with different values of num_speculative_tokens. + """ + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +@pytest.mark.parametrize( + "common_llm_kwargs", + [{ + # Skip cuda graph recording for fast test. + "enforce_eager": True, + + # Precision + "dtype": PRECISION, + + # Main model + "model_name": MAIN_MODEL, + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "speculative_config": { + "model": SPEC_MODEL, + "num_speculative_tokens": MAX_SPEC_TOKENS, + "disable_by_batch_size": 4, + }, +}]) +@pytest.mark.parametrize("batch_size", [1, 5]) +@pytest.mark.parametrize( + "output_len", + [ + # Use smaller output len for fast test. + 32, + ]) +@pytest.mark.parametrize("seed", [1]) +def test_eagle_disable_queue(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, baseline_llm_kwargs, + test_llm_kwargs, batch_size: int, output_len: int, + seed: int): + """Verify that eagle speculative decoding produces exact equality + to without spec decode when speculation is disabled for large + batch sizes. 
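+
+    With disable_by_batch_size=4, speculation is expected to be skipped once
+    enough requests are enqueued, so the batch_size=5 case exercises the
+    disabled path while batch_size=1 keeps speculation active.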
+ """ + run_equality_correctness_test(vllm_runner, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size, output_len, seed) + + +if __name__ == "__main__": + import pytest + pytest.main([__file__]) \ No newline at end of file diff --git a/tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py b/tests/long_term/spec_decode_v0/e2e/test_medusa_correctness.py similarity index 99% rename from tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py rename to tests/long_term/spec_decode_v0/e2e/test_medusa_correctness.py index e0c2efd7af..73fcc6c1fc 100644 --- a/tests/e2e/long_term/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/long_term/spec_decode_v0/e2e/test_medusa_correctness.py @@ -41,9 +41,9 @@ import pytest -from tests.e2e.long_term.spec_decode.e2e.conftest import \ +from tests.long_term.spec_decode_v0.e2e.conftest import \ run_equality_correctness_test -from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.long_term.spec_decode_v0.utils import maybe_enable_chunked_prefill # main model # lmsys/vicuna-7b-v1.3 was to be used but it's causing diff --git a/tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py b/tests/long_term/spec_decode_v0/e2e/test_mlp_correctness.py similarity index 99% rename from tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py rename to tests/long_term/spec_decode_v0/e2e/test_mlp_correctness.py index 56db617755..12c87b8960 100644 --- a/tests/e2e/long_term/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/long_term/spec_decode_v0/e2e/test_mlp_correctness.py @@ -41,9 +41,9 @@ from vllm.model_executor.layers.vocab_parallel_embedding import \ pad_vocab_size # noqa: F401 -from tests.e2e.long_term.spec_decode.e2e.conftest import \ +from tests.long_term.spec_decode_v0.e2e.conftest import \ run_equality_correctness_test -from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.long_term.spec_decode_v0.utils import maybe_enable_chunked_prefill # main model MAIN_MODEL = "JackFram/llama-160m" diff --git a/tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py b/tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py rename to tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py diff --git a/tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py b/tests/long_term/spec_decode_v0/e2e/test_ngram_correctness.py similarity index 99% rename from tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py rename to tests/long_term/spec_decode_v0/e2e/test_ngram_correctness.py index b99187fe37..01e516994a 100644 --- a/tests/e2e/long_term/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/long_term/spec_decode_v0/e2e/test_ngram_correctness.py @@ -44,9 +44,9 @@ import pytest -from tests.e2e.long_term.spec_decode.e2e.conftest import \ +from tests.long_term.spec_decode_v0.e2e.conftest import \ run_equality_correctness_test -from tests.e2e.long_term.spec_decode.utils import maybe_enable_chunked_prefill +from tests.long_term.spec_decode_v0.utils import maybe_enable_chunked_prefill @pytest.mark.parametrize( diff --git a/tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py b/tests/long_term/spec_decode_v0/test_dynamic_spec_decode.py similarity index 96% rename from tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py rename to tests/long_term/spec_decode_v0/test_dynamic_spec_decode.py index 
8e9480ea26..d36f9c94ae 100644 --- a/tests/e2e/long_term/spec_decode/test_dynamic_spec_decode.py +++ b/tests/long_term/spec_decode_v0/test_dynamic_spec_decode.py @@ -27,8 +27,8 @@ from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler -from tests.e2e.long_term.spec_decode.utils import create_batch, mock_worker +from tests.long_term.spec_decode_v0.test_utils import mock_spec_decode_sampler +from tests.long_term.spec_decode_v0.utils import create_batch, mock_worker @pytest.mark.parametrize('queue_size', [4]) diff --git a/tests/e2e/long_term/spec_decode/test_multi_step_worker.py b/tests/long_term/spec_decode_v0/test_multi_step_worker.py similarity index 99% rename from tests/e2e/long_term/spec_decode/test_multi_step_worker.py rename to tests/long_term/spec_decode_v0/test_multi_step_worker.py index b3017a987e..11d255be5e 100644 --- a/tests/e2e/long_term/spec_decode/test_multi_step_worker.py +++ b/tests/long_term/spec_decode_v0/test_multi_step_worker.py @@ -29,7 +29,7 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.e2e.long_term.spec_decode.utils import ( +from tests.long_term.spec_decode_v0.utils import ( assert_logprobs_dict_allclose, create_batch, create_seq_group_metadata_from_prompts, create_worker, patch_execute_model_with_seeds, zero_kv_cache) diff --git a/tests/e2e/long_term/spec_decode/test_ngram_worker.py b/tests/long_term/spec_decode_v0/test_ngram_worker.py similarity index 99% rename from tests/e2e/long_term/spec_decode/test_ngram_worker.py rename to tests/long_term/spec_decode_v0/test_ngram_worker.py index 078a4d2bed..8b7cbd61f1 100644 --- a/tests/e2e/long_term/spec_decode/test_ngram_worker.py +++ b/tests/long_term/spec_decode_v0/test_ngram_worker.py @@ -22,7 +22,7 @@ from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from tests.e2e.long_term.spec_decode.utils import ( +from tests.long_term.spec_decode_v0.utils import ( create_seq_group_metadata_from_prompts, create_worker) diff --git a/tests/e2e/long_term/spec_decode/test_spec_decode_worker.py b/tests/long_term/spec_decode_v0/test_spec_decode_worker.py similarity index 99% rename from tests/e2e/long_term/spec_decode/test_spec_decode_worker.py rename to tests/long_term/spec_decode_v0/test_spec_decode_worker.py index 94a1bcf1e7..87db67b099 100644 --- a/tests/e2e/long_term/spec_decode/test_spec_decode_worker.py +++ b/tests/long_term/spec_decode_v0/test_spec_decode_worker.py @@ -35,10 +35,10 @@ from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) -from tests.e2e.long_term.spec_decode.test_utils import mock_spec_decode_sampler -from tests.e2e.long_term.spec_decode.utils import (create_batch, - create_sampler_output_list, - create_worker, mock_worker) +from tests.long_term.spec_decode_v0.test_utils import mock_spec_decode_sampler +from tests.long_term.spec_decode_v0.utils import (create_batch, + create_sampler_output_list, + create_worker, mock_worker) from vllm_ascend.worker.draft_model_runner import TP1DraftModelRunner from vllm_ascend.worker.worker import NPUWorker diff --git a/tests/e2e/long_term/spec_decode/test_utils.py b/tests/long_term/spec_decode_v0/test_utils.py similarity index 100% rename from tests/e2e/long_term/spec_decode/test_utils.py rename to tests/long_term/spec_decode_v0/test_utils.py 
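After this series of renames, the spec-decode suites import their shared helpers from the new tests.long_term.spec_decode_v0 package instead of tests.e2e.long_term.spec_decode. A minimal sketch of the import pattern these modules now share (paths taken from the renames above; the helper functions themselves are assumed to keep the signatures shown in the hunks):

# Sketch of the post-rename import paths used by the spec-decode test modules.
# Only the package location changes; run_equality_correctness_test,
# mock_spec_decode_sampler and the utils helpers are assumed unchanged.
from tests.long_term.spec_decode_v0.e2e.conftest import \
    run_equality_correctness_test
from tests.long_term.spec_decode_v0.test_utils import mock_spec_decode_sampler
from tests.long_term.spec_decode_v0.utils import (create_batch,
                                                  maybe_enable_chunked_prefill,
                                                  mock_worker)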
diff --git a/tests/e2e/long_term/spec_decode/utils.py b/tests/long_term/spec_decode_v0/utils.py similarity index 100% rename from tests/e2e/long_term/spec_decode/utils.py rename to tests/long_term/spec_decode_v0/utils.py diff --git a/tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py b/tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py similarity index 54% rename from tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py rename to tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py index 2219a6f552..3b5e1986f2 100644 --- a/tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py +++ b/tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py @@ -63,7 +63,10 @@ def test_mtp_correctness( with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True) + ref_llm = LLM(model=model_name, + max_model_len=256, + gpu_memory_utilization=0.8, + enforce_eager=True) ref_outputs = ref_llm.chat(test_prompts, sampling_config) del ref_llm @@ -74,6 +77,7 @@ def test_mtp_correctness( "num_speculative_tokens": 1, }, max_model_len=256, + gpu_memory_utilization=0.8, enforce_eager=True) spec_outputs = spec_llm.chat(test_prompts, sampling_config) matches = 0 @@ -90,3 +94,62 @@ def test_mtp_correctness( # Upon failure, inspect the outputs to check for inaccuracy. assert matches > int(0.66 * len(ref_outputs)) del spec_llm + + +def test_mtp_torchair_correctness( + monkeypatch: pytest.MonkeyPatch, + test_prompts: list[list[dict[str, Any]]], + sampling_config: SamplingParams, + model_name: str, +): + ''' + Compare the outputs of a original LLM and a speculative LLM + should be the same when using mtp speculative decoding. + ''' + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + ref_llm = LLM(model=model_name, + max_model_len=256, + enforce_eager=False, + additional_config={ + "torchair_graph_config": { + "enabled": True + }, + "ascend_scheduler_config": { + "enabled": True + }, + }) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm + + spec_llm = LLM(model=model_name, + trust_remote_code=True, + enforce_eager=False, + speculative_config={ + "method": "deepseek_mtp", + "num_speculative_tokens": 1, + }, + additional_config={ + "torchair_graph_config": { + "enabled": True + }, + "ascend_scheduler_config": { + "enabled": True + }, + }) + spec_outputs = spec_llm.chat(test_prompts, sampling_config) + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") + + # Heuristic: expect at least 66% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. 
+ assert matches > int(0.66 * len(ref_outputs)) + del spec_llm diff --git a/tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py b/tests/long_term/spec_decode_v1/test_v1_spec_decode.py similarity index 100% rename from tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py rename to tests/long_term/spec_decode_v1/test_v1_spec_decode.py diff --git a/tests/e2e/long_term/test_accuracy.py b/tests/long_term/test_accuracy.py similarity index 100% rename from tests/e2e/long_term/test_accuracy.py rename to tests/long_term/test_accuracy.py diff --git a/tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py b/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py similarity index 97% rename from tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py rename to tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py index 27986cb149..3a9068ff6b 100644 --- a/tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py +++ b/tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py @@ -38,7 +38,7 @@ def run_test(model_name, queue, more_args=None): - model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4" + model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4,enforce_eager=True" if more_args is not None: model_args = f"{model_args},{more_args}" results = lm_eval.simple_evaluate( diff --git a/tests/multicard/test_data_parallel.py b/tests/multicard/test_data_parallel.py index 6c0a20de97..cabeac2846 100644 --- a/tests/multicard/test_data_parallel.py +++ b/tests/multicard/test_data_parallel.py @@ -16,7 +16,6 @@ # """ Compare the outputs of vLLM with and without aclgraph. - Run `pytest tests/multicard/test_data_parallel.py`. """ @@ -30,6 +29,7 @@ MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"] +@pytest.mark.skipif(True, reason="OPEN ME when dp is supported on A2") @pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", reason="Data parallel only support on v1") @pytest.mark.parametrize("model", MODELS) diff --git a/tests/e2e/multicard/test_dynamic_npugraph_batchsize.py b/tests/multicard/test_dynamic_npugraph_batchsize.py similarity index 100% rename from tests/e2e/multicard/test_dynamic_npugraph_batchsize.py rename to tests/multicard/test_dynamic_npugraph_batchsize.py diff --git a/tests/e2e/multicard/test_ilama_lora_tp2.py b/tests/multicard/test_ilama_lora_tp2.py similarity index 83% rename from tests/e2e/multicard/test_ilama_lora_tp2.py rename to tests/multicard/test_ilama_lora_tp2.py index e743141b7a..e61ce250c8 100644 --- a/tests/e2e/multicard/test_ilama_lora_tp2.py +++ b/tests/multicard/test_ilama_lora_tp2.py @@ -1,8 +1,8 @@ import pytest from tests.conftest import VllmRunner -from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT, - MODEL_PATH, do_sample) +from tests.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT, MODEL_PATH, + do_sample) @pytest.mark.parametrize("distributed_executor_backend", ["mp"]) diff --git a/tests/multicard/test_model_qwen3_w4a8.py b/tests/multicard/test_model_qwen3_w4a8.py new file mode 100644 index 0000000000..e059863638 --- /dev/null +++ b/tests/multicard/test_model_qwen3_w4a8.py @@ -0,0 +1,65 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+"""Compare the outputs of vLLM when using W4A8 quantization on qwen3 models.
+
+Run `pytest tests/multicard/test_model_qwen3_w4a8.py`.
+"""
+import os
+
+import pytest
+from modelscope import snapshot_download  # type: ignore
+from vllm import LLM, SamplingParams
+
+MODELS = ["vllm-ascend/Qwen3-8B-W4A8"]
+PROMPTS = [
+    "Hello, my name is",
+    "The future of AI is",
+]
+
+
+@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
+                    reason="w4a8_dynamic is not supported on v0")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("max_tokens", [16])
+def test_qwen3_model_with_w4a8_linear_method(model: str,
+                                             max_tokens: int) -> None:
+    messages = [[{"role": "user", "content": prompt}] for prompt in PROMPTS]
+    sampling_params = SamplingParams(
+        max_tokens=max_tokens,
+        temperature=0.0,
+    )
+    llm = LLM(
+        model=snapshot_download(model),
+        max_model_len=1024,
+        tensor_parallel_size=2,
+        enforce_eager=True,
+        quantization="ascend",
+    )
+    vllm_outputs = llm.chat(
+        messages,
+        sampling_params,
+        chat_template_kwargs={"enable_thinking": False},
+    )
+    golden_outputs = [
+        "Hello! My name is Qwen, and I'm a large language model developed",
+        "The future of AI is a topic of great interest and debate, with many possibilities",
+    ]
+    assert len(vllm_outputs) == len(golden_outputs)
+    for vllm_output, golden_output in zip(vllm_outputs, golden_outputs):
+        assert vllm_output.outputs[0].text == golden_output
+        print(f"Generated text: {vllm_output.outputs[0].text!r}")
diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py
similarity index 72%
rename from tests/e2e/multicard/test_offline_inference_distributed.py
rename to tests/multicard/test_offline_inference_distributed.py
index f5ec2c872b..df69ff8e3d 100644
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/multicard/test_offline_inference_distributed.py
@@ -23,8 +23,10 @@
 import os
 from unittest.mock import patch
 
+import pytest
 from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams
+from vllm.model_executor.models.registry import ModelRegistry
 
 from tests.conftest import VllmRunner
 
@@ -83,6 +85,10 @@ def test_models_distributed_topk() -> None:
         vllm_model.generate(example_prompts, sampling_params)
 
 
+@pytest.mark.skip(
+    reason=
+    "deepseek dbo does not support half-precision floats yet; this UT will be enabled once it is actually supported"
+)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
 def test_models_distributed_DeepSeek_dbo():
@@ -94,6 +100,33 @@ def test_models_distributed_DeepSeek_dbo():
             tensor_parallel_size=4,
             distributed_executor_backend="mp",
     ) as vllm_model:
+        model_arch = 'DeepseekV2ForCausalLM'
+        registed_models = ModelRegistry.models
+        assert registed_models[
+            model_arch].module_name == "vllm_ascend.models.deepseek_dbo"
+        assert registed_models[
+            model_arch].class_name == "CustomDeepseekDBOForCausalLM"
+        vllm_model.generate(example_prompts, sampling_params)
+
+
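# Illustrative sketch only: the registry check repeated by the two DBO tests in
# this hunk could be factored into a small helper along these lines (assuming
# ModelRegistry.models entries expose module_name and class_name exactly as
# used above; the helper name is hypothetical).
def assert_dbo_model_registered(model_arch: str) -> None:
    registered = ModelRegistry.models
    assert registered[
        model_arch].module_name == "vllm_ascend.models.deepseek_dbo"
    assert registered[
        model_arch].class_name == "CustomDeepseekDBOForCausalLM"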
+@pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in") +@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"}) +def test_models_distributed_DeepSeekV3_dbo(): + example_prompts = ["The president of the United States is"] * 41 + dtype = "half" + sampling_params = SamplingParams(max_tokens=100, temperature=0.0) + with VllmRunner( + "vllm-ascend/DeepSeek-V3-Pruning", + dtype=dtype, + tensor_parallel_size=4, + distributed_executor_backend="mp", + ) as vllm_model: + model_arch = 'DeepseekV3ForCausalLM' + registed_models = ModelRegistry.models + assert registed_models[ + model_arch].module_name == "vllm_ascend.models.deepseek_dbo" + assert registed_models[ + model_arch].class_name == "CustomDeepseekDBOForCausalLM" vllm_model.generate(example_prompts, sampling_params) diff --git a/tests/e2e/multicard/test_pyhccl_distributed.py b/tests/multicard/test_pyhccl_distributed.py similarity index 100% rename from tests/e2e/multicard/test_pyhccl_distributed.py rename to tests/multicard/test_pyhccl_distributed.py diff --git a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/multicard/test_torchair_graph_mode.py similarity index 85% rename from tests/e2e/multicard/test_torchair_graph_mode.py rename to tests/multicard/test_torchair_graph_mode.py index d06ec7de22..96fa92ef4b 100644 --- a/tests/e2e/multicard/test_torchair_graph_mode.py +++ b/tests/multicard/test_torchair_graph_mode.py @@ -30,10 +30,13 @@ @pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", reason="torchair graph is not supported on v0") -def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch): +@pytest.mark.parametrize("VLLM_ASCEND_ENABLE_DBO", ["0", "1"]) +def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch, + VLLM_ASCEND_ENABLE_DBO): with monkeypatch.context() as m: m.setenv("VLLM_USE_MODELSCOPE", "True") m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + m.setenv("VLLM_ASCEND_ENABLE_DBO", VLLM_ASCEND_ENABLE_DBO) example_prompts = [ "Hello, my name is", @@ -68,10 +71,10 @@ def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch): # inaccurate. This will only change if accuracy improves with the # official weights of DeepSeek-V3. golden_results = [ - 'Hello, my name is feasibility伸 spazio debtor添', - 'The president of the United States is begg"""\n杭州风和 bestimm', - 'The capital of France is frequentlyশามalinkAllowed', - 'The future of AI is deleting俯احت怎么样了حراف', + 'Hello, my name is下载早点向前很有่อง', + 'The president of the United States isSender)## physiological Albany', + 'The capital of France is Rocky转角 hospitalizedinterval sparked', + 'The future of AI is её asegο BIOS一扫', ] assert len(golden_results) == len(vllm_output) diff --git a/tests/multicard/test_w4a8_deepseek.py b/tests/multicard/test_w4a8_deepseek.py new file mode 100644 index 0000000000..98a6f0c17e --- /dev/null +++ b/tests/multicard/test_w4a8_deepseek.py @@ -0,0 +1,67 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest + +from tests.conftest import VllmRunner + + +@pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in") +@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", + reason="w4a8_dynamic is not supported on v0") +def test_deepseek_W4A8(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + dtype = "bfloat16" + max_tokens = 5 + with VllmRunner( + "vllm-ascend/DeepSeek-R1-w4a8-pruning", + dtype=dtype, + tensor_parallel_size=2, + enforce_eager=True, + quantization="ascend", + enable_expert_parallel=True, + additional_config={ + "torchair_graph_config": { + "enabled": False, + }, + "ascend_scheduler_config": { + "enabled": True, + } + }, + ) as vllm_model: + # use greedy sampler to make sure the generated results are fix + vllm_output = vllm_model.generate_greedy(prompts, max_tokens) + + golden_results = [ + 'Hello, my name is逸研究发现IPPudsimentary', + 'The president of the United States is逸 Ban Corporealistically', + 'The capital of France is逸 Ban Corporealistically', + 'The future of AI is逸 Ban Corporealistically', + ] + assert len(golden_results) == len(vllm_output) + for i in range(len(vllm_output)): + assert golden_results[i] == vllm_output[i][1] + print(f"Generated text: {vllm_output[i][1]!r}") diff --git a/tests/e2e/singlecard/ops/test_vocabparallelembedding.py b/tests/ops/test_vocabparallelembedding.py similarity index 100% rename from tests/e2e/singlecard/ops/test_vocabparallelembedding.py rename to tests/ops/test_vocabparallelembedding.py diff --git a/tests/e2e/singlecard/__init__.py b/tests/singlecard/__init__.py similarity index 100% rename from tests/e2e/singlecard/__init__.py rename to tests/singlecard/__init__.py diff --git a/tests/e2e/singlecard/compile/__init__.py b/tests/singlecard/compile/__init__.py similarity index 100% rename from tests/e2e/singlecard/compile/__init__.py rename to tests/singlecard/compile/__init__.py diff --git a/tests/e2e/singlecard/compile/test_simple.py b/tests/singlecard/compile/test_simple.py similarity index 100% rename from tests/e2e/singlecard/compile/test_simple.py rename to tests/singlecard/compile/test_simple.py diff --git a/tests/e2e/singlecard/core/__init__.py b/tests/singlecard/core/__init__.py similarity index 100% rename from tests/e2e/singlecard/core/__init__.py rename to tests/singlecard/core/__init__.py diff --git a/tests/e2e/singlecard/core/test_ascend_scheduler.py b/tests/singlecard/core/test_ascend_scheduler.py similarity index 95% rename from tests/e2e/singlecard/core/test_ascend_scheduler.py rename to tests/singlecard/core/test_ascend_scheduler.py index 7d9c1b1ef5..c382ebdf40 100644 --- a/tests/e2e/singlecard/core/test_ascend_scheduler.py +++ b/tests/singlecard/core/test_ascend_scheduler.py @@ -98,7 +98,11 @@ def create_scheduler( ) kv_cache_config = KVCacheConfig( num_blocks=num_blocks, # A large number of blocks to hold all requests - kv_cache_tensors=[], + **({ + "tensors": {} + } if vllm_version_is("0.9.0") else { + "kv_cache_tensors": [] + }), kv_cache_groups=[ KVCacheGroupSpec(['layer'], FullAttentionSpec(block_size, 1, 1, torch.float32, @@ -141,8 +145,8 @@ def create_requests(num_requests: int, multi_modal_hashes=None, eos_token_id=EOS_TOKEN_ID, **({ - "pooling_params": None - } if not 
vllm_version_is("0.9.1") else {}), + "arrival_time": 0.0 + } if vllm_version_is("0.9.0") else {}), ) requests.append(request) return requests @@ -258,9 +262,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) + ) scheduler.update_from_output(output, model_runner_output) # Schedule the next step. All three requests are running. @@ -284,10 +286,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) - + ) scheduler.update_from_output(output1, model_runner_output) output2 = scheduler.schedule() assert len(scheduler.running) == 3 @@ -338,10 +337,7 @@ def test_stop_via_update_from_output(): 11]], # First request hits EOS, second continues spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) + prompt_logprobs_dict={}) scheduler.update_from_output(scheduler_output, model_output) @@ -389,10 +385,7 @@ def test_stop_via_update_from_output(): [13, 14]], # First request hits stop token spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) + prompt_logprobs_dict={}) scheduler.update_from_output(scheduler_output, model_output) @@ -439,10 +432,7 @@ def test_stop_via_update_from_output(): [13]], # First request exceeds max_tokens spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) + prompt_logprobs_dict={}) scheduler.update_from_output(scheduler_output, model_output) @@ -484,10 +474,7 @@ def test_stop_via_update_from_output(): sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) + prompt_logprobs_dict={}) scheduler.update_from_output(scheduler_output, model_output) @@ -537,10 +524,7 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool], spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) - + ) scheduler.update_from_output(scheduler_output0, model_runner_output) # Schedule the next step. @@ -557,10 +541,7 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool], spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) - + ) scheduler.update_from_output(scheduler_output1, model_runner_output) @@ -584,6 +565,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): 1. Speculated tokens get scheduled correctly 2. 
Spec decoding stats properly count number of draft and accepted tokens """ + if vllm_version_is("0.9.0"): + return num_spec_tokens = max(1, max(len(t) for t in spec_tokens)) scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens) requests = create_requests(num_requests=len(spec_tokens), num_tokens=1) @@ -610,10 +593,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): spec_token_ids=spec_tokens, logprobs=None, prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) - + ) engine_core_outputs = scheduler.update_from_output(output, model_runner_output) @@ -652,10 +632,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) - + ) engine_core_outputs = scheduler.update_from_output(output, model_runner_output) @@ -750,9 +727,7 @@ def make_output(scheduler: AscendScheduler): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) + ) def assert_scheduler_empty(scheduler: AscendScheduler): @@ -769,10 +744,11 @@ def assert_scheduler_empty(scheduler: AscendScheduler): assert len(scheduler.encoder_cache_manager.cached) == 0 # KVCache Manager. - assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. - req_to_blocks) == 0 - assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. - num_cached_block) == 0 + if not vllm_version_is("0.9.0"): + assert len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks) == 0 + assert len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].num_cached_block) == 0 assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 num_free_blocks = ( scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) @@ -813,4 +789,4 @@ def test_memory_leak(): scheduler.update_from_output(scheduler_output, model_runner_output) # Confirm no memory leak. 
- assert_scheduler_empty(scheduler) + assert_scheduler_empty(scheduler) \ No newline at end of file diff --git a/tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py b/tests/singlecard/core/test_ascend_scheduler_e2e.py similarity index 100% rename from tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py rename to tests/singlecard/core/test_ascend_scheduler_e2e.py diff --git a/tests/e2e/singlecard/ops/__init__.py b/tests/singlecard/ops/__init__.py similarity index 100% rename from tests/e2e/singlecard/ops/__init__.py rename to tests/singlecard/ops/__init__.py diff --git a/tests/e2e/singlecard/ops/test_fused_moe.py b/tests/singlecard/ops/test_fused_moe.py similarity index 100% rename from tests/e2e/singlecard/ops/test_fused_moe.py rename to tests/singlecard/ops/test_fused_moe.py diff --git a/tests/e2e/singlecard/ops/test_multi_step.py b/tests/singlecard/ops/test_multi_step.py similarity index 100% rename from tests/e2e/singlecard/ops/test_multi_step.py rename to tests/singlecard/ops/test_multi_step.py diff --git a/tests/e2e/singlecard/ops/test_rotary_embedding.py b/tests/singlecard/ops/test_rotary_embedding.py similarity index 100% rename from tests/e2e/singlecard/ops/test_rotary_embedding.py rename to tests/singlecard/ops/test_rotary_embedding.py diff --git a/tests/e2e/singlecard/sample/__init__.py b/tests/singlecard/sample/__init__.py similarity index 100% rename from tests/e2e/singlecard/sample/__init__.py rename to tests/singlecard/sample/__init__.py diff --git a/tests/e2e/singlecard/sample/test_rejection_sampler.py b/tests/singlecard/sample/test_rejection_sampler.py similarity index 100% rename from tests/e2e/singlecard/sample/test_rejection_sampler.py rename to tests/singlecard/sample/test_rejection_sampler.py diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/singlecard/test_aclgraph.py similarity index 88% rename from tests/e2e/singlecard/test_aclgraph.py rename to tests/singlecard/test_aclgraph.py index e0bfb65cf8..fb02555956 100644 --- a/tests/e2e/singlecard/test_aclgraph.py +++ b/tests/singlecard/test_aclgraph.py @@ -36,9 +36,11 @@ reason="aclgraph only support on v1") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("full_graph", [False]) def test_models( model: str, max_tokens: int, + full_graph: bool, monkeypatch: pytest.MonkeyPatch, ) -> None: with monkeypatch.context() as m: @@ -54,7 +56,15 @@ def test_models( temperature=0.0) # TODO: change to use vllmrunner when the registry of custom op is solved # while running pytest - vllm_model = LLM(model) + if full_graph: + vllm_model = LLM(model, + compilation_config={ + "full_cuda_graph": True, + "cudagraph_capture_sizes": + [1, 4, 16, 64, 256] + }) + else: + vllm_model = LLM(model) vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params) del vllm_model torch.npu.empty_cache() diff --git a/tests/singlecard/test_ascend_config.py b/tests/singlecard/test_ascend_config.py new file mode 100644 index 0000000000..7344371d87 --- /dev/null +++ b/tests/singlecard/test_ascend_config.py @@ -0,0 +1,188 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import pytest + +from tests.conftest import VllmRunner +from vllm_ascend.ascend_config import (clear_ascend_config, get_ascend_config, + init_ascend_config) + + +def _clean_up_ascend_config(func): + + def wrapper(*args, **kwargs): + clear_ascend_config() + func(*args, **kwargs) + clear_ascend_config() + + return wrapper + + +@_clean_up_ascend_config +def test_run_without_ascend_config(): + with VllmRunner("facebook/opt-125m"): + ascend_config = get_ascend_config() + + assert not ascend_config.torchair_graph_config.enabled + assert not ascend_config.torchair_graph_config.use_cached_graph + assert ascend_config.torchair_graph_config.graph_batch_sizes == [] + assert not ascend_config.torchair_graph_config.graph_batch_sizes_init + assert not ascend_config.ascend_scheduler_config.enabled + + +@_clean_up_ascend_config +def test_run_with_ascend_config(): + if os.getenv("VLLM_USE_V1") == "0": + pytest.skip("graph only works on v1") + + input_additional_config_1 = { + "torchair_graph_config": { + # torchair graph only works with deepseek. The e2e test should be added + # in multicard test with deepseek models. + "enabled": False, + "use_cached_graph": True, + "graph_batch_sizes": [1, 2, 4, 8], + "graph_batch_sizes_init": False, + "enable_multistream_moe": True, + "enable_multistream_mla": True, + }, + "ascend_scheduler_config": { + "enabled": True, + "enable_chunked_prefill": True, + }, + } + + # check passed with eager mode + with VllmRunner("facebook/opt-125m", + enforce_eager=True, + additional_config=input_additional_config_1): + ascend_config = get_ascend_config() + + assert not ascend_config.torchair_graph_config.enabled + assert ascend_config.torchair_graph_config.use_cached_graph + assert ascend_config.torchair_graph_config.graph_batch_sizes == [ + 1, 2, 4, 8 + ] + assert not ascend_config.torchair_graph_config.graph_batch_sizes_init + assert ascend_config.torchair_graph_config.enable_multistream_mla + assert ascend_config.torchair_graph_config.enable_multistream_moe + assert ascend_config.ascend_scheduler_config.enabled + assert ascend_config.ascend_scheduler_config.enable_chunked_prefill + + +@_clean_up_ascend_config +def test_ascend_config_init_error(): + # ascend_config should be initialized first + with pytest.raises(RuntimeError): + _ = get_ascend_config() + + +@_clean_up_ascend_config +def test_ascend_config_load_error(): + if os.getenv("VLLM_USE_V1") == "0": + pytest.skip("graph only works on v1") + # graph_batch_sizes should be list. + with pytest.raises(TypeError): + input_additional_config_fake_1 = { + "torchair_graph_config": { + "graph_batch_sizes": "fake_size", + }, + } + with VllmRunner("facebook/opt-125m", + additional_config=input_additional_config_fake_1): + pass + + # graph_batch_sizes_init should not be True when graph_batch_sizes is not empty. 
+ with pytest.raises(ValueError): + input_additional_config_fake_2 = { + "torchair_graph_config": { + "graph_batch_sizes": [1, 2, 4, 8], + "graph_batch_sizes_init": True, + }, + } + with VllmRunner("facebook/opt-125m", + additional_config=input_additional_config_fake_2): + pass + + # torchair graph only works with deepseek. + with pytest.raises(NotImplementedError): + input_additional_config_fake_2 = { + "torchair_graph_config": { + "enabled": True, + }, + } + with VllmRunner("facebook/opt-125m", + enforce_eager=False, + additional_config=input_additional_config_fake_2): + pass + + # torchair graph should not be enabled with eager mode + with pytest.raises(RuntimeError): + input_additional_config_fake_3 = { + "torchair_graph_config": { + "enabled": True, + }, + } + with VllmRunner("facebook/opt-125m", + enforce_eager=True, + additional_config=input_additional_config_fake_3): + pass + + +@_clean_up_ascend_config +def test_check_ascend_config_v0(): + if os.getenv("VLLM_USE_V1") == "1": + pytest.skip("graph only works on v1, this is the test for v0") + with pytest.raises(NotImplementedError): + input_additional_config_fake_1 = { + "torchair_graph_config": { + "enabled": True, + }, + } + with VllmRunner("facebook/opt-125m", + additional_config=input_additional_config_fake_1): + pass + + +@_clean_up_ascend_config +def test_ascend_config_refresh(): + from vllm.config import get_current_vllm_config + vllm_config = get_current_vllm_config() + # set additional_config with none + init_ascend_config(vllm_config) + + input_additional_config = { + "torchair_graph_config": { + "enabled": False, + "use_cached_graph": True, + "graph_batch_sizes": [1, 2, 4, 8], + "graph_batch_sizes_init": False, + }, + "refresh": True, + } + + # refresh ascend config + with VllmRunner("facebook/opt-125m", + additional_config=input_additional_config): + ascend_config = get_ascend_config() + + assert not ascend_config.torchair_graph_config.enabled + assert ascend_config.torchair_graph_config.use_cached_graph + assert ascend_config.torchair_graph_config.graph_batch_sizes == [ + 1, 2, 4, 8 + ] + assert not ascend_config.torchair_graph_config.graph_batch_sizes_init diff --git a/tests/e2e/singlecard/test_camem.py b/tests/singlecard/test_camem.py similarity index 100% rename from tests/e2e/singlecard/test_camem.py rename to tests/singlecard/test_camem.py diff --git a/tests/e2e/singlecard/test_chunked.py b/tests/singlecard/test_chunked.py similarity index 100% rename from tests/e2e/singlecard/test_chunked.py rename to tests/singlecard/test_chunked.py diff --git a/tests/e2e/singlecard/test_guided_decoding.py b/tests/singlecard/test_guided_decoding.py similarity index 100% rename from tests/e2e/singlecard/test_guided_decoding.py rename to tests/singlecard/test_guided_decoding.py diff --git a/tests/e2e/singlecard/test_ilama_lora.py b/tests/singlecard/test_ilama_lora.py similarity index 100% rename from tests/e2e/singlecard/test_ilama_lora.py rename to tests/singlecard/test_ilama_lora.py diff --git a/tests/e2e/singlecard/test_offline_inference.py b/tests/singlecard/test_offline_inference.py similarity index 98% rename from tests/e2e/singlecard/test_offline_inference.py rename to tests/singlecard/test_offline_inference.py index de69612279..cd65a24969 100644 --- a/tests/e2e/singlecard/test_offline_inference.py +++ b/tests/singlecard/test_offline_inference.py @@ -43,6 +43,10 @@ ] os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" +QUANTIZATION_MODELS = [ + "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8", +] + 
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half", "float16"]) diff --git a/tests/e2e/singlecard/test_profile_execute_duration.py b/tests/singlecard/test_profile_execute_duration.py similarity index 100% rename from tests/e2e/singlecard/test_profile_execute_duration.py rename to tests/singlecard/test_profile_execute_duration.py diff --git a/tests/e2e/singlecard/test_prompt_embedding.py b/tests/singlecard/test_prompt_embedding.py similarity index 100% rename from tests/e2e/singlecard/test_prompt_embedding.py rename to tests/singlecard/test_prompt_embedding.py diff --git a/tests/e2e/singlecard/test_pyhccl.py b/tests/singlecard/test_pyhccl.py similarity index 100% rename from tests/e2e/singlecard/test_pyhccl.py rename to tests/singlecard/test_pyhccl.py diff --git a/tests/e2e/singlecard/test_sampler.py b/tests/singlecard/test_sampler.py similarity index 100% rename from tests/e2e/singlecard/test_sampler.py rename to tests/singlecard/test_sampler.py diff --git a/tests/e2e/singlecard/test_scheduler.py b/tests/singlecard/test_scheduler.py similarity index 95% rename from tests/e2e/singlecard/test_scheduler.py rename to tests/singlecard/test_scheduler.py index b3adf945bf..8021f0306c 100644 --- a/tests/e2e/singlecard/test_scheduler.py +++ b/tests/singlecard/test_scheduler.py @@ -31,7 +31,6 @@ from vllm.v1.structured_output import StructuredOutputManager from vllm_ascend.core.scheduler import AscendScheduler -from vllm_ascend.utils import vllm_version_is EOS_TOKEN_ID = 50256 @@ -131,9 +130,6 @@ def create_requests(num_requests: int, multi_modal_placeholders=mm_position, multi_modal_hashes=None, eos_token_id=EOS_TOKEN_ID, - **({ - "pooling_params": None - } if not vllm_version_is("0.9.1") else {}), ) requests.append(request) return requests @@ -241,10 +237,7 @@ def test_stop_via_update_from_output(): 11]], # First request hits EOS, second continues spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) + prompt_logprobs_dict={}) scheduler.update_from_output(scheduler_output, model_output) @@ -290,10 +283,7 @@ def test_stop_via_update_from_output(): [13, 14]], # First request hits stop token spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) + prompt_logprobs_dict={}) scheduler.update_from_output(scheduler_output, model_output) @@ -338,10 +328,7 @@ def test_stop_via_update_from_output(): [13]], # First request exceeds max_tokens spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) + prompt_logprobs_dict={}) scheduler.update_from_output(scheduler_output, model_output) @@ -382,10 +369,7 @@ def test_stop_via_update_from_output(): sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}, - **({ - "pooler_output": [] - } if not vllm_version_is("0.9.1") else {})) + prompt_logprobs_dict={}) scheduler.update_from_output(scheduler_output, model_output) diff --git a/tests/ut/fake_weight/config.json b/tests/ut/fake_weight/config.json deleted file mode 100644 index b3fb716a30..0000000000 --- a/tests/ut/fake_weight/config.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "_name_or_path": "facebook/opt-125m", - "activation_dropout": 0.0, - "activation_function": "relu", - "architectures": [ - "OPTForCausalLM" - ], - "attention_dropout": 0.0, - "bos_token_id": 2, - "do_layer_norm_before": 
true, - "dropout": 0.1, - "eos_token_id": 2, - "ffn_dim": 3072, - "hidden_size": 768, - "init_std": 0.02, - "layerdrop": 0.0, - "max_position_embeddings": 2048, - "model_type": "opt", - "num_attention_heads": 12, - "num_hidden_layers": 12, - "pad_token_id": 1, - "prefix": "", - "torch_dtype": "float16", - "transformers_version": "4.21.0.dev0", - "use_cache": true, - "vocab_size": 50272, - "word_embed_proj_dim": 768 -} diff --git a/tests/ut/kv_connector/test_llmdatadist_connector.py b/tests/ut/kv_connector/test_llmdatadist_connector.py new file mode 100644 index 0000000000..94650f43e9 --- /dev/null +++ b/tests/ut/kv_connector/test_llmdatadist_connector.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. + +from tests.ut.kv_connector.utils import (create_request, create_scheduler, + create_vllm_config) +from vllm_ascend.distributed.llmdatadist_c_mgr_connector import \ + LLMDataDistCMgrConnectorMetadata + + +def test_basic_inferface(): + """Unit test for basic LLMDataDistCMgrConnector interface functionality.""" + + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + + # 2 Full Blocks and 1 Half Block. + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 2 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + + request = create_request(request_id=1, + num_tokens=NUM_TOKENS, + do_remote_prefill=True) + request_id = request.request_id + + scheduler.add_request(request) + + # Remote Prefill, triggers LLMDataDistCMgrConnectorMetadata. + scheduler_output = scheduler.schedule() + kv_connector_metadata = scheduler_output.kv_connector_metadata + assert kv_connector_metadata is not None + assert isinstance(kv_connector_metadata, LLMDataDistCMgrConnectorMetadata) + + assert len(kv_connector_metadata.requests) == 1 + assert request_id in kv_connector_metadata.requests + req_meta = kv_connector_metadata.requests[request_id] + + for block_id, block in zip( + req_meta.local_block_ids, scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks[request_id]): + assert block_id == block.block_id diff --git a/tests/ut/kv_connector/test_remote_decode_lifecycle.py b/tests/ut/kv_connector/test_remote_decode_lifecycle.py new file mode 100644 index 0000000000..d321490f65 --- /dev/null +++ b/tests/ut/kv_connector/test_remote_decode_lifecycle.py @@ -0,0 +1,123 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from vllm-project/vllm/blob/main/tests/conftest.py +# + +from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT +from vllm.v1.request import FinishReason, RequestStatus + +from tests.ut.kv_connector.utils import (assert_scheduler_empty, + create_model_runner_output, + create_request, create_scheduler, + create_vllm_config) + + +def test_basic_lifecycle(): + """Test lifecycle of a Remote Decode request.""" + + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + + # 2 Full Blocks and 1 Half Block. + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 2 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + + request = create_request(request_id=1, + max_tokens=1, + num_tokens=NUM_TOKENS, + do_remote_decode=True) + + scheduler.add_request(request) + request_id = request.request_id + + # STEP (1): Prefill. + # (1a): schedule() + scheduler_output = scheduler.schedule() + assert len(scheduler.running) == 1 + assert len(scheduler_output.scheduled_new_reqs) == 1 + + # (1b): execute_model() + model_runner_output = create_model_runner_output(reqs=[request]) + + # (1c): update_from_output() + engine_core_outputs = scheduler.update_from_output(scheduler_output, + model_runner_output) + + # Ensure the request is finished after 1 tokens. + assert request.is_finished() + assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED + output = engine_core_outputs[0].outputs[0] + assert output.finish_reason == FinishReason.LENGTH + assert output.kv_transfer_params is not None + + # Request freed in Scheduler and blocks should be freed + assert request_id in scheduler.finished_req_ids + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 0 + + scheduler.schedule() + assert_scheduler_empty(scheduler) + + +def test_prefix_cache_lifecycle(): + """Test that remote decode params still works with a prefix cache hit.""" + + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + + # Prime the KVCache. + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 3 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + + request_remote_a = create_request(request_id=1, + max_tokens=1, + num_tokens=NUM_TOKENS, + do_remote_decode=True) + + scheduler.add_request(request_remote_a) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote_a], + use_eos=True) + scheduler.update_from_output(scheduler_output, model_runner_output) + scheduler.schedule() + scheduler.update_from_output(scheduler_output, EMPTY_MODEL_RUNNER_OUTPUT) + + ##################### + # Actual Test: confirm we send all blocks. + + # Send the KV Transfer. + NUM_EXTERNAL_FULL_BLOCKS -= 1 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + + request_remote_b = create_request(request_id=1, + num_tokens=NUM_TOKENS, + do_remote_decode=True) + + scheduler.add_request(request_remote_b) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote_b]) + eco = scheduler.update_from_output(scheduler_output, model_runner_output) + kv_transfer_params = eco[0].outputs[0].kv_transfer_params + # Ensure we send all block ids, even if there is a cache hit. 
+ assert (len( + kv_transfer_params["remote_block_ids"]) == (NUM_EXTERNAL_FULL_BLOCKS + + 1)) + + scheduler.schedule() + assert_scheduler_empty(scheduler) diff --git a/tests/ut/kv_connector/test_remote_prefill_lifecycle.py b/tests/ut/kv_connector/test_remote_prefill_lifecycle.py new file mode 100644 index 0000000000..c8629d5993 --- /dev/null +++ b/tests/ut/kv_connector/test_remote_prefill_lifecycle.py @@ -0,0 +1,242 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/blob/main/tests/conftest.py +# +import copy + +from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT +from vllm.v1.request import FinishReason, RequestStatus + +from tests.ut.kv_connector.utils import (assert_scheduler_empty, + create_model_runner_output, + create_request, create_scheduler, + create_vllm_config) + + +def test_basic_lifecycle(): + """Test lifecycle of a remote prefill.""" + + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + + # 2 Full Blocks and 1 Half Block. + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 2 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + START_FREE_BLOCK_QUEUE_SIZE = ( + scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) + + request = create_request(request_id=1, + num_tokens=NUM_TOKENS, + do_remote_prefill=True) + + scheduler.add_request(request) + request_id = request.request_id + + # STEP (1): + # (1a): schedule() + scheduler_output = scheduler.schedule() + + # Nothing running and empty scheduler output. + assert len(scheduler.running) == 0 + assert len(scheduler_output.scheduled_new_reqs) == 0 + assert len(scheduler_output.scheduled_cached_reqs) == 0 + assert len(scheduler_output.num_scheduled_tokens) == 0 + assert scheduler_output.total_num_scheduled_tokens == 0 + + # Req waiting for KVs with no computed/scheduled toks ... + assert len(scheduler.waiting) == 1 + assert request in scheduler.waiting + assert (request.status == RequestStatus.WAITING_FOR_REMOTE_KVS) + assert (request.num_computed_tokens == 0) + + # ... but should have (uncached) blocks allocated to it. + block_pool = scheduler.kv_cache_manager.block_pool + assert (block_pool.free_block_queue.num_free_blocks < + START_FREE_BLOCK_QUEUE_SIZE) + assert len(block_pool.cached_block_hash_to_block) == 0 + blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[ + 0].req_to_blocks[request_id] + for block in blocks: + assert block._block_hash is None + + # (1b): forward() + model_runner_output = EMPTY_MODEL_RUNNER_OUTPUT + + # (1c): update_from_output() + engine_core_outputs = scheduler.update_from_output(scheduler_output, + model_runner_output) + assert not engine_core_outputs or not engine_core_outputs[0].outputs + + # STEP (2): + # (2a): schedule(): nothing happens! 
+ scheduler_output = scheduler.schedule() + assert len(scheduler.waiting) == 1 + assert len(scheduler.running) == 0 + + # (2b): forward(): request finishes recv. + model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) + model_runner_output.finished_recving = [request_id] + + # (2c): update_from_output(): + engine_core_outputs = scheduler.update_from_output(scheduler_output, + model_runner_output) + assert len(scheduler.waiting) == 1 + assert (request_id in scheduler.finished_recving_kv_req_ids) + + # STEP (3): + # (3a): schedule(): this should actually schedule. + scheduler_output = scheduler.schedule() + assert len(scheduler.running) == 1 + + # Confirm the block are actually allocated. + num_hashed_blocks = 0 + blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[ + 0].req_to_blocks[request_id] + for block in blocks: + assert block.ref_cnt == 1 + num_hashed_blocks += (1 if block._block_hash is not None else 0) + assert num_hashed_blocks == NUM_EXTERNAL_FULL_BLOCKS + + # Confirm the rest of the prompt is scheduled in this step. + scheduled_req = scheduler_output.scheduled_new_reqs[0] + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[request_id] + num_computed_tokens = scheduled_req.num_computed_tokens + total_prompt_tokens = len(scheduled_req.prompt_token_ids) + assert (num_scheduled_tokens == total_prompt_tokens - num_computed_tokens) + + # (3b): execute_model() + model_runner_output = create_model_runner_output([request]) + # (3c): update_from_output() + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Step (4): Hit EOS. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output([request], use_eos=True) + engine_core_outputs = scheduler.update_from_output(scheduler_output, + model_runner_output) + scheduler.schedule() + + outputs = engine_core_outputs[0].outputs + assert len(outputs) == 1 + output = outputs[0] + assert output.finish_reason == FinishReason.STOP + assert_scheduler_empty(scheduler) + + +def test_no_spurious_prefix_caching(): + """ + With P/D, blocks can be allocated but uncomputed for + multiple engine steps. This test confirms that we do + not accidentally have cache hits against uncomputed + blocks. + """ + + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + + # 2 and a half full external blocks. + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 2 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + + # Both of these requests have prompts like [1,1,1,1,1, ...] + request_remote = create_request( + request_id=1, + num_tokens=NUM_TOKENS, + do_remote_prefill=True, + use_all_1s_for_prompt_tokens=True, + ) + + # Schedule the remote prefill request. This should not + # cause any blocks to be cached. + scheduler.add_request(request_remote) + scheduler_output = scheduler.schedule() + scheduler.update_from_output(scheduler_output, EMPTY_MODEL_RUNNER_OUTPUT) + assert len(scheduler.waiting) == 1 + + remote_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[ + 0].req_to_blocks[request_remote.request_id] + + # Remote blocks should not be cached. + for block in remote_blocks: + assert block.ref_cnt == 1 + assert block._block_hash is None + + +def test_full_block_prompt(): + """Test that we handle a prompt that is the full block size.""" + + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + + # 2 Full Blocks and 1 Half Block. 
+ BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 2 + NUM_TOKENS = int(BLOCK_SIZE * NUM_EXTERNAL_FULL_BLOCKS) + + request = create_request(request_id=1, + num_tokens=NUM_TOKENS, + do_remote_prefill=True) + + scheduler.add_request(request) + request_id = request.request_id + + # STEP (1): Initialize a recv. + scheduler_output = scheduler.schedule() + # All blocks should be allocated. + num_blocks = len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks[request_id]) + assert num_blocks == NUM_EXTERNAL_FULL_BLOCKS + model_runner_output = EMPTY_MODEL_RUNNER_OUTPUT + scheduler.update_from_output(scheduler_output, model_runner_output) + + # # STEP (2): Recv. + scheduler_output = scheduler.schedule() + model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) + model_runner_output.finished_recving = [request_id] + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.waiting) == 1 + assert (request_id in scheduler.finished_recving_kv_req_ids) + + # # STEP (3): Run as usual. + scheduler_output = scheduler.schedule() + + # We need to recompute the final token of the prompt to generate + # the first new token, so we should not have a new block. + num_blocks = len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks[request_id]) + assert num_blocks == NUM_EXTERNAL_FULL_BLOCKS + assert (scheduler_output.scheduled_new_reqs[0].num_computed_tokens == + NUM_TOKENS - 1) + assert (scheduler_output.num_scheduled_tokens[request_id] == 1) + + model_runner_output = create_model_runner_output([request]) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # # Step (4): Hit EOS. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output([request], use_eos=True) + engine_core_outputs = scheduler.update_from_output(scheduler_output, + model_runner_output) + scheduler.schedule() + + outputs = engine_core_outputs[0].outputs + assert len(outputs) == 1 + output = outputs[0] + assert output.finish_reason == FinishReason.STOP + assert_scheduler_empty(scheduler) diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py new file mode 100644 index 0000000000..a8f65ff429 --- /dev/null +++ b/tests/ut/kv_connector/utils.py @@ -0,0 +1,194 @@ +# SPDX-License-Identifier: Apache-2.0 +# This code is from: https://github.com/vllm-project/vllm/tests/v1/kv_connector/unit/utils.py +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. + +import os +from typing import Any, Optional + +import torch +from vllm import SamplingParams +from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig, + ModelConfig, SchedulerConfig, VllmConfig) +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec) +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.request import Request +from vllm.v1.structured_output import StructuredOutputManager + +EOS_TOKEN_ID = 50256 +os.environ["VLLM_USE_V1"] = "1" + + +def assert_scheduler_empty(scheduler: Scheduler): + """Confirm the scheduler is "empty" - i.e. no leaks.""" + # Scheduler Metadata. 
+ assert len(scheduler.requests) == 0 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 0 + assert len(scheduler.finished_req_ids) == 0 + assert len(scheduler.finished_recving_kv_req_ids) == 0 + assert len(scheduler._cached_reqs_data) == 0 + + # EncoderCacheManager. + assert len(scheduler.encoder_cache_manager.freed) == 0 + assert len(scheduler.encoder_cache_manager.cached) == 0 + + # KVCache Manager. + assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + req_to_blocks) == 0 + assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 + assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + num_cached_block) == 0 + num_free_blocks = ( + scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) + assert num_free_blocks == ( + scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1) + + # NOTE(rob): just the ref count on blocks will be 0. The hash + # value, etc will remain since we lazily evict for prefix cache. + for block in scheduler.kv_cache_manager.block_pool.blocks: + assert block.ref_cnt == 0 + + +def create_vllm_config( + model: str = "facebook/opt-125m", + max_num_seqs: int = 16, + max_num_batched_tokens: int = 1024, + block_size: int = 128, +) -> VllmConfig: + """Initialize VllmConfig For Testing.""" + scheduler_config = SchedulerConfig( + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + max_model_len=max_num_batched_tokens, + ) + model_config = ModelConfig( + model=model, + task="auto", + tokenizer=model, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="float16", + seed=42, + ) + # Cache config, optionally force APC + cache_config = CacheConfig( + block_size=block_size, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + enable_prefix_caching=True, + ) + kv_transfer_config = KVTransferConfig( + kv_connector="LLMDataDistCMgrConnector", + kv_role="kv_both", + kv_connector_module_path= + "vllm_ascend.distributed.llmdatadist_connector_v1_a3") + return VllmConfig(scheduler_config=scheduler_config, + model_config=model_config, + cache_config=cache_config, + kv_transfer_config=kv_transfer_config, + device_config=DeviceConfig("cpu")) + + +def create_scheduler( + vllm_config: VllmConfig, + num_blocks: int = 10000, +) -> Scheduler: + """Initialize Scheduler For Testing.""" + block_size = vllm_config.cache_config.block_size + kv_cache_config = KVCacheConfig( + num_blocks=num_blocks, # A large number of blocks to hold all requests + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(block_size, 1, 1, torch.float16, + False)) + ], + ) + vllm_config.cache_config.num_gpu_blocks = num_blocks + return Scheduler( + vllm_config=vllm_config, + kv_cache_config=kv_cache_config, + log_stats=True, + structured_output_manager=StructuredOutputManager(vllm_config), + ) + + +def create_request( + request_id: int, + num_tokens: int = 10, + max_tokens: int = 128, + do_remote_decode: bool = False, + do_remote_prefill: bool = False, + use_all_1s_for_prompt_tokens: bool = False, + num_remote_blocks: int = 3, +) -> Request: + """Make dummy request for testing.""" + + kv_transfer_params: Optional[dict[str, Any]] = None + + if do_remote_decode: + assert not do_remote_prefill + kv_transfer_params = dict(do_remote_prefill=False, + do_remote_decode=True) + elif do_remote_prefill: + kv_transfer_params = dict(do_remote_prefill=True, + do_remote_decode=False, + remote_engine_id="my-engine-id", + remote_block_ids=list( + 
range(num_remote_blocks)), + remote_host="my-host", + remote_port=1234, + remote_tp_size=1) + + max_tokens = 1 if do_remote_decode else max_tokens + sampling_params = SamplingParams(max_tokens=max_tokens) + + if use_all_1s_for_prompt_tokens: + prompt_token_ids = [1] * num_tokens + else: + prompt_token_ids = [i * request_id for i in range(num_tokens)] + + req = Request( + request_id=f"id-{request_id}", + prompt_token_ids=prompt_token_ids, + sampling_params=sampling_params, + multi_modal_inputs=None, + multi_modal_placeholders=None, + multi_modal_hashes=None, + eos_token_id=EOS_TOKEN_ID, + ) + req.kv_transfer_params = kv_transfer_params + return req + + +def create_model_runner_output( + reqs: list[Request], + finished_sending: Optional[list[str]] = None, + finished_recving: Optional[list[str]] = None, + use_eos: bool = False, +) -> ModelRunnerOutput: + """Make dummy model runner output for testing.""" + + # Make request data. + req_ids = [req.request_id for req in reqs] + req_id_to_index = {req_id: idx for idx, req_id in enumerate(req_ids)} + + # Make sampled tokens. + sampled_token = EOS_TOKEN_ID if use_eos else 0 + sampled_token_ids = [[sampled_token] for _ in req_ids] + + # Make output data structure. + return ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + finished_sending=finished_sending, + finished_recving=finished_recving, + ) diff --git a/tests/ut/ops/test_expert_load_balancer.py b/tests/ut/ops/test_expert_load_balancer.py deleted file mode 100644 index 3b7a69ddd4..0000000000 --- a/tests/ut/ops/test_expert_load_balancer.py +++ /dev/null @@ -1,147 +0,0 @@ -# fused moe ops test will hit the infer_schema error, we need add the patch -# here to make the test pass. 
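A minimal sketch of how the KV-connector test helpers introduced above (`create_vllm_config`, `create_scheduler`, `create_request`, `create_model_runner_output`) are expected to compose, following the lifecycle test shown earlier in this patch; the token count and the single-step flow are illustrative only:

# Illustrative composition of the helpers above; not taken from the diff itself.
vllm_config = create_vllm_config()
scheduler = create_scheduler(vllm_config)
request = create_request(request_id=1, num_tokens=32)

scheduler.add_request(request)
scheduler_output = scheduler.schedule()
# Pretend the model produced EOS on the first step so the request finishes.
model_runner_output = create_model_runner_output([request], use_eos=True)
scheduler.update_from_output(scheduler_output, model_runner_output)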
-import vllm_ascend.patch.worker.patch_common.patch_utils # type: ignore[import] # isort: skip # noqa - -import json -import unittest -from typing import List, TypedDict -from unittest import mock - -import torch - -from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer - - -class Device(TypedDict): - device_id: int - device_expert: List[int] - - -class Layer(TypedDict): - layer_id: int - device_count: int - device_list: List[Device] - - -class MockData(TypedDict): - moe_layer_count: int - layer_list: List[Layer] - - -MOCK_DATA: MockData = { - "moe_layer_count": - 1, - "layer_list": [{ - "layer_id": - 0, - "device_count": - 2, - "device_list": [{ - "device_id": 0, - "device_expert": [7, 2, 0, 3, 5] - }, { - "device_id": 1, - "device_expert": [6, 1, 4, 7, 2] - }] - }] -} - - -class TestExpertLoadBalancer(unittest.TestCase): - - def setUp(self): - json_file = "expert_map.json" - with open(json_file, 'w') as f: - json.dump(MOCK_DATA, f) - - self.expert_load_balancer = ExpertLoadBalancer(json_file, - global_expert_num=8) - - def test_init(self): - - self.assertIsInstance(self.expert_load_balancer.expert_map_tensor, - torch.Tensor) - self.assertEqual(self.expert_load_balancer.layers_num, - MOCK_DATA["moe_layer_count"]) - self.assertEqual(self.expert_load_balancer.ranks_num, - MOCK_DATA["layer_list"][0]["device_count"]) - - def test_generate_index_dicts(self): - tensor_2d = torch.tensor([[7, 2, 0, 3, 5], [6, 1, 4, 7, 2]]) - result = self.expert_load_balancer.generate_index_dicts(tensor_2d) - expected_result = [{ - 7: 0, - 2: 1, - 0: 2, - 3: 3, - 5: 4 - }, { - 6: 5, - 1: 6, - 4: 7, - 7: 8, - 2: 9 - }] - self.assertEqual(result, expected_result) - - def test_generate_expert_placement_map(self): - expert_placement_map = self.expert_load_balancer.generate_expert_placement_map( - ) - self.assertEqual(expert_placement_map.shape, - (self.expert_load_balancer.layers_num, - self.expert_load_balancer.ranks_num, 8)) - self.assertTrue(torch.all(expert_placement_map >= -1)) - - def test_generate_log2phy_expert_map(self): - layer_id = 0 - log2phy_map = self.expert_load_balancer.generate_log2phy_expert_map( - layer_id) - self.assertEqual(log2phy_map.shape, - (self.expert_load_balancer.ranks_num, 8)) - self.assertTrue(torch.all(log2phy_map >= -1)) - - @mock.patch("torch_npu.npu._lazy_init") - @mock.patch("torch.npu.current_device", return_value="cpu") - def test_get_rank_placement_map(self, mock_current_device, mock_lazy_init): - layer_id = 0 - rank_id = 0 - rank_local_expert_num, rank_expert_map = self.expert_load_balancer.get_rank_placement_map( - layer_id, rank_id) - self.assertEqual(rank_local_expert_num, 5) - expected_tensor = torch.tensor([2, -1, 1, 3, -1, 4, -1, 0], - dtype=torch.int32).to( - rank_expert_map.device) - self.assertTrue(rank_expert_map.equal(expected_tensor)) - - rank_id = 1 - rank_local_expert_num, rank_expert_map = self.expert_load_balancer.get_rank_placement_map( - layer_id, rank_id) - expected_tensor = torch.tensor([-1, 1, 4, -1, 2, -1, 0, 3], - dtype=torch.int32).to( - rank_expert_map.device) - self.assertTrue(rank_expert_map.equal(expected_tensor)) - - def test_get_rank_log2phy_map(self): - layer_id = 0 - rank_id = 0 - log2phy_map = self.expert_load_balancer.get_rank_log2phy_map( - layer_id, rank_id) - expected_tensor = torch.tensor([2, 6, 1, 3, 7, 4, 5, 0], - dtype=torch.int32).to( - log2phy_map.device) - self.assertTrue(log2phy_map.equal(expected_tensor)) - - rank_id = 1 - log2phy_map = self.expert_load_balancer.get_rank_log2phy_map( - layer_id, rank_id) - 
expected_tensor = torch.tensor([2, 6, 9, 3, 7, 4, 5, 8], - dtype=torch.int32).to( - log2phy_map.device) - self.assertTrue(log2phy_map.equal(expected_tensor)) - - def test_get_global_redundant_expert_num(self): - redundant_expert_num = self.expert_load_balancer.get_global_redundant_expert_num( - ) - expected_redundant_expert_num = len(MOCK_DATA["layer_list"][0]["device_list"][0]["device_expert"]) * \ - MOCK_DATA["layer_list"][0]["device_count"] - 8 - self.assertEqual(redundant_expert_num, expected_redundant_expert_num) diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py deleted file mode 100644 index 5ec4dd72cc..0000000000 --- a/tests/ut/test_ascend_config.py +++ /dev/null @@ -1,244 +0,0 @@ -import os -import unittest -from unittest import mock - -from transformers import PretrainedConfig -from vllm.config import ModelConfig, VllmConfig - -from vllm_ascend.ascend_config import (check_ascend_config, - clear_ascend_config, get_ascend_config, - init_ascend_config) - - -class TestAscendConfig(unittest.TestCase): - - @staticmethod - def _clean_up_ascend_config(func): - - def wrapper(*args, **kwargs): - clear_ascend_config() - func(*args, **kwargs) - clear_ascend_config() - - return wrapper - - @_clean_up_ascend_config - def test_init_ascend_config_without_additional_config(self): - test_vllm_config = VllmConfig() - # No additional config given, check the default value here. - ascend_config = init_ascend_config(test_vllm_config) - self.assertEqual(ascend_config.expert_tensor_parallel_size, 0) - self.assertIsNone(ascend_config.expert_map_path) - - torchair_graph_config = ascend_config.torchair_graph_config - self.assertFalse(torchair_graph_config.enabled) - self.assertFalse(torchair_graph_config.use_cached_graph) - self.assertEqual(torchair_graph_config.graph_batch_sizes, []) - self.assertFalse(torchair_graph_config.graph_batch_sizes_init) - self.assertFalse(torchair_graph_config.enable_multistream_mla) - self.assertFalse(torchair_graph_config.enable_multistream_moe) - self.assertTrue(torchair_graph_config.enable_view_optimize) - self.assertFalse(torchair_graph_config.enable_kv_nz) - - ascend_scheduler_config = ascend_config.ascend_scheduler_config - self.assertFalse(ascend_scheduler_config.enabled) - - @_clean_up_ascend_config - def test_init_ascend_config_with_additional_config(self): - test_vllm_config = VllmConfig() - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - "use_cached_graph": True, - "graph_batch_sizes": [1, 2, 4], - "graph_batch_sizes_init": False, - "enable_multistream_mla": True, - "enable_multistream_moe": True, - "enable_view_optimize": True, - "enable_kv_nz": True - }, - "ascend_scheduler_config": { - "enabled": True - }, - "expert_tensor_parallel_size": 1, - "expert_map_path": "test_expert_map_path", - "refresh": True - } - ascend_config = init_ascend_config(test_vllm_config) - self.assertEqual(ascend_config.expert_tensor_parallel_size, 1) - self.assertEqual(ascend_config.expert_map_path, "test_expert_map_path") - - torchair_graph_config = ascend_config.torchair_graph_config - self.assertTrue(torchair_graph_config.enabled) - self.assertTrue(torchair_graph_config.use_cached_graph) - self.assertEqual(torchair_graph_config.graph_batch_sizes, [1, 2, 4]) - self.assertFalse(torchair_graph_config.graph_batch_sizes_init) - self.assertTrue(torchair_graph_config.enable_multistream_mla) - self.assertTrue(torchair_graph_config.enable_multistream_moe) - self.assertTrue(torchair_graph_config.enable_view_optimize) - 
self.assertTrue(torchair_graph_config.enable_kv_nz) - - ascend_scheduler_config = ascend_config.ascend_scheduler_config - self.assertTrue(ascend_scheduler_config.enabled) - - @_clean_up_ascend_config - def test_init_ascend_config_with_refresh(self): - test_vllm_config = VllmConfig() - ascend_config = init_ascend_config(test_vllm_config) - self.assertFalse(ascend_config.torchair_graph_config.enabled) - - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - }, - } - ascend_config = init_ascend_config(test_vllm_config) - self.assertFalse(ascend_config.torchair_graph_config.enabled) - - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - }, - "refresh": True, - } - ascend_config = init_ascend_config(test_vllm_config) - self.assertTrue(ascend_config.torchair_graph_config.enabled) - - @_clean_up_ascend_config - def test_init_ascend_config_with_wrong_input(self): - test_vllm_config = VllmConfig() - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - "graph_batch_sizes": "fake_size", - }, - "refresh": True, - } - with self.assertRaises(TypeError): - init_ascend_config(test_vllm_config) - - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": False, - "graph_batch_sizes": [1, 2, 4, 8], - "graph_batch_sizes_init": True, - }, - "refresh": True, - } - with self.assertRaises(ValueError): - init_ascend_config(test_vllm_config) - - @_clean_up_ascend_config - def test_get_ascend_config(self): - test_vllm_config = VllmConfig() - ascend_config = init_ascend_config(test_vllm_config) - self.assertEqual(get_ascend_config(), ascend_config) - - @_clean_up_ascend_config - def test_get_ascend_config_without_init(self): - with self.assertRaises(RuntimeError): - get_ascend_config() - - @_clean_up_ascend_config - def test_clear_ascend_config(self): - test_vllm_config = VllmConfig() - ascend_config = init_ascend_config(test_vllm_config) - self.assertEqual(get_ascend_config(), ascend_config) - clear_ascend_config() - with self.assertRaises(RuntimeError): - get_ascend_config() - - @_clean_up_ascend_config - def test_check_ascend_config_pass(self): - test_vllm_config = VllmConfig() - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) - - # For V1 engine - with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) - - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": False, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) - - @_clean_up_ascend_config - def test_check_ascend_config_wrong_case(self): - test_vllm_config = VllmConfig() - # For V0 engine - with mock.patch.dict(os.environ, {"VLLM_USE_V1": "0"}): - with self.assertRaises(NotImplementedError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) - with self.assertRaises(NotImplementedError): - test_vllm_config.additional_config = { - "ascend_scheduler_config": { - "enabled": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, True) - # For V1 engine - with mock.patch.dict(os.environ, {"VLLM_USE_V1": 
"1"}): - # torchair + eager mode - with self.assertRaises(RuntimeError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - }, - "refresh": True - } - init_ascend_config(test_vllm_config) - enforce_eager = True - check_ascend_config(test_vllm_config, enforce_eager) - # torchair + non deepseek model - with self.assertRaises(NotImplementedError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": True, - }, - "refresh": True - } - model_path = os.path.join(os.path.dirname(__file__), - "fake_weight") - fake_model_config = ModelConfig(model=model_path) - fake_model_config.hf_config = PretrainedConfig() - fake_model_config.hf_config.model_type = "llama" - test_vllm_config.model_config = fake_model_config - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) - # aclgraph + deepseek model - with self.assertRaises(NotImplementedError): - test_vllm_config.additional_config = { - "torchair_graph_config": { - "enabled": False, - }, - "refresh": True - } - model_path = os.path.join(os.path.dirname(__file__), - "fake_weight") - fake_model_config = ModelConfig(model=model_path) - fake_model_config.hf_config = PretrainedConfig() - fake_model_config.hf_config.model_type = "deepseek" - test_vllm_config.model_config = fake_model_config - init_ascend_config(test_vllm_config) - check_ascend_config(test_vllm_config, False) diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py deleted file mode 100644 index fdffa2a0fd..0000000000 --- a/tests/ut/worker/test_worker_v1.py +++ /dev/null @@ -1 +0,0 @@ -# placeholder diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index 31d3d2c529..66ad65ea7e 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -36,12 +36,12 @@ def __init__(self, vllm_config): self.ascend_scheduler_config = AscendSchedulerConfig( ascend_scheduler_config) - self.expert_tensor_parallel_size = int( - additional_config.get("expert_tensor_parallel_size", 0)) self.expert_map_path = additional_config.get("expert_map_path", None) - self.dynamic_eplb = additional_config.get("dynamic_eplb", False) self.chunked_prefill_for_mla = additional_config.get( "chunked_prefill_for_mla", False) + self.enable_weight_nz_layout = additional_config.get( + "enable_weight_nz_layout", False) + self.dynamic_eplb = additional_config.get("dynamic_eplb", False) class TorchairGraphConfig: @@ -139,6 +139,12 @@ def check_ascend_config(vllm_config, enforce_eager): else: # torchair_graph case if ascend_config.torchair_graph_config.enabled: + # torchair_graph is not supported for V1 without mla currently. + if envs.VLLM_MLA_DISABLE: + logger.warning( + "Torchair graph mode is still experimental and not supported for V1 without mla currently, " + "it has been disabled automatically.") + ascend_config.torchair_graph_config.enabled = False # torchair_graph is supported for deepseek model only currently. 
if vllm_config.model_config: model_type = vllm_config.model_config.hf_config.model_type diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py new file mode 100644 index 0000000000..75fd71c859 --- /dev/null +++ b/vllm_ascend/ascend_forward_context.py @@ -0,0 +1,79 @@ +from contextlib import contextmanager +from enum import Enum +from typing import Any, Optional + +import torch +from vllm.config import VllmConfig +from vllm.distributed import get_dp_group +from vllm.forward_context import get_forward_context, set_forward_context + + +class FusedMoEState(Enum): + AllGather = 0 + All2All = 1 + MC2 = 2 + + +# TODO(zzzzwwjj): add soc_version to choose branch +def get_fused_moe_state(ep_size: int, with_prefill: bool): + if ep_size == 1: + return FusedMoEState.AllGather + # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph. + elif ep_size < 16 or with_prefill: + return FusedMoEState.All2All + else: + return FusedMoEState.MC2 + + +@contextmanager +def set_ascend_forward_context( + attn_metadata: Any, + vllm_config: VllmConfig, + virtual_engine: int = 0, + num_tokens: Optional[int] = None, + num_tokens_across_dp: Optional[torch.Tensor] = None, + with_prefill: bool = True, + in_profile_run: bool = False): + """A context manager that stores the current forward context, + can be attention metadata, etc. + We add some additional param into forward_context. + """ + with set_forward_context(attn_metadata, + vllm_config, + virtual_engine=virtual_engine, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp): + forward_context = get_forward_context() + forward_context.with_prefill = with_prefill + + ep_size = torch.distributed.get_world_size( + ) if vllm_config.parallel_config.enable_expert_parallel else 1 + + fused_moe_state = get_fused_moe_state(ep_size, with_prefill) + + forward_context.fused_moe_state = fused_moe_state + + forward_context.in_profile_run = in_profile_run + + # NOTE: This cannot be set using set_forward_context + # due to multiple warmups before actual capturing + forward_context.capturing = False + + dp_world_size = get_dp_group().world_size + if dp_world_size > 1 and forward_context.dp_metadata is not None: + forward_context.max_tokens_across_dp = forward_context.dp_metadata.max_tokens_across_dp_cpu.item( + ) + elif num_tokens is not None: + forward_context.max_tokens_across_dp = num_tokens + elif attn_metadata is not None: + if hasattr(attn_metadata, 'num_actual_tokens'): + forward_context.max_tokens_across_dp = attn_metadata.num_actual_tokens + else: + forward_context.max_tokens_across_dp = attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens + else: + forward_context.max_tokens_across_dp = None + + try: + yield + finally: + pass diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py index 9c112ed620..3c400ad14f 100644 --- a/vllm_ascend/attention/attention.py +++ b/vllm_ascend/attention/attention.py @@ -40,8 +40,6 @@ from vllm_ascend.worker.model_runner import ( ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata) -_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128] - def generate_attn_mask(max_seq_len: int, dtype=torch.float16, mask_value=None): # Construct lower triangle matrix. 
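For orientation on the new `ascend_forward_context` module above: `set_ascend_forward_context` is meant to wrap model execution the same way vLLM's `set_forward_context` does, with the extra Ascend fields readable from the forward context inside the block. A minimal usage sketch, where `model`, `input_ids`, `positions`, `attn_metadata`, `vllm_config` and `num_input_tokens` are placeholders for the runner's own objects rather than names defined by this patch:

# Hedged usage sketch; the names below are placeholders from the model runner.
from vllm.forward_context import get_forward_context

with set_ascend_forward_context(attn_metadata,
                                vllm_config,
                                num_tokens=num_input_tokens,
                                with_prefill=False):
    # Inside the block, layers can read the extra fields, e.g. the MoE dispatch
    # mode chosen by get_fused_moe_state(): AllGather when ep_size == 1,
    # MC2 for ep_size >= 16 without prefill, All2All otherwise.
    state = get_forward_context().fused_moe_state
    hidden_states = model(input_ids=input_ids, positions=positions)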
@@ -1007,15 +1005,6 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - # TODO: support numHeads / numKvHeads < 16 in MLA kernel - if self.torchair_graph_enabled: - assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ - ("The allowed number of queries per kv when enabling both MLA and Graph mode" - " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," - " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1," - " please make sure after the tensor parallel split, num_heads / num_kv_heads in " - "{32, 64, 128}.") - def exec_kv( self, hidden_states: torch.Tensor, diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index e6a2376786..3417bb87fb 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -24,12 +24,16 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, AttentionType) from vllm.attention.backends.utils import CommonAttentionState +from vllm.config import get_current_vllm_config from vllm.forward_context import ForwardContext, get_forward_context from vllm.utils import direct_register_custom_op from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.gpu_input_batch import InputBatch +from vllm_ascend.attention.utils import \ + AscendCommonAttentionMetadata as CommonAttentionMetadata from vllm_ascend.ops.attention import vanilla_chunked_prefill +from vllm_ascend.utils import get_graph_params class AscendAttentionBackend(AttentionBackend): @@ -114,6 +118,7 @@ class AscendMetadata: query_start_loc: torch.Tensor query_lens: torch.Tensor seq_lens: torch.Tensor + seq_lens_list: list # Maximum query length in the batch. None for decoding. max_query_len: Optional[int] = None # (num_tokens,). The indices of the token slots that input tokens will be @@ -133,7 +138,7 @@ class AscendMetadata: # For logging. num_input_tokens: int = 0 # Number of tokens including padding. 
- with_prefill_across_dp: bool = False + enable_dbo_across_dp: bool = False class AscendAttentionMetadataBuilder: @@ -149,23 +154,26 @@ def build(self, num_reqs, num_actual_tokens, max_query_len, - common_prefix_len, - with_prefill_across_dp: bool = False): + common_attn_metadata: CommonAttentionMetadata, + enable_dbo_across_dp: bool = False, + *args, + **kwargs): block_table = self.runner.input_batch.block_table[0].get_device_tensor( ) block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = ( block_table[:num_reqs]) - query_lens = self.runner.query_lens - seq_lens = self.runner.seq_lens_cpu[:num_reqs] - slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to( - self.runner.device, non_blocking=True) + query_start_loc = common_attn_metadata.query_start_loc + seq_lens = common_attn_metadata.seq_lens + # TODO: Refactor these two param to common metadata in runners, + # preparing for the hybrid KV groups feature + query_lens = common_attn_metadata.query_lens if common_attn_metadata.query_lens is not None else self.runner.query_lens + seq_lens_list = common_attn_metadata.seq_lens_list if common_attn_metadata.seq_lens_list is not None else self.runner.seq_lens_list + + slot_mapping = self.runner.slot_mapping[:num_actual_tokens] attn_mask = self.runner.attn_mask attn_state = self.runner.attn_state - query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1] - query_start_loc = query_start_loc_cpu.to(self.runner.device, - non_blocking=True) attn_metadata = AscendMetadata( num_actual_tokens=num_actual_tokens, @@ -173,11 +181,40 @@ def build(self, query_start_loc=query_start_loc, query_lens=query_lens, seq_lens=seq_lens, + seq_lens_list=seq_lens_list, max_query_len=max_query_len, slot_mapping=slot_mapping, attn_mask=attn_mask, attn_state=attn_state, - with_prefill_across_dp=with_prefill_across_dp) + enable_dbo_across_dp=enable_dbo_across_dp) + return attn_metadata + + def build_dummy_metadata(self, num_actual_tokens, num_reqs, + num_scheduled_tokens, attn_state): + if attn_state == AscendAttentionState.DecodeOnly: + # NOTE: We only need to pay attention to seq_lens_list and block_table here + common_attn_metadata = CommonAttentionMetadata(seq_lens_list=[2] * + num_reqs) + + block_table = self.runner.input_batch.block_table[0].block_table + block_table[:num_reqs, 0] = torch.arange(1, + num_reqs + 1, + device=block_table.device, + dtype=block_table.dtype) + + attn_metadata = self.build( + num_reqs=num_reqs, + num_actual_tokens=num_actual_tokens, + max_query_len=num_scheduled_tokens.max(), + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) + else: + raise NotImplementedError( + "Currently we only support building dummy metadata for DecodeOnly state" + ) + + attn_metadata.attn_state = attn_state return attn_metadata @@ -217,6 +254,10 @@ def __init__( self.key_cache = None self.value_cache = None + vllm_config = get_current_vllm_config() + self.full_graph = vllm_config.compilation_config.full_cuda_graph + self.block_size = vllm_config.cache_config.block_size + def forward( self, layer: AttentionLayer, @@ -228,21 +269,7 @@ def forward( output: Optional[torch.Tensor] = None, trace_flag: bool = True, ) -> torch.Tensor: - """Forward pass with Ascend attention. 
- Args: - query: shape = [batch_size, seq_len, num_heads * head_size] - key: shape = [batch_size, seq_len, num_kv_heads * head_size] - value: shape = [batch_size, seq_len, num_kv_heads * head_size] - kv_cache: shape = [2, num_blocks, block_size, - num_kv_heads, head_size] - key_cache = [num_blocks, block_size, - num_kv_heads, head_size] - value_cache = [num_blocks, block_size, - num_kv_heads, head_size] - attn_metadata: Metadata for attention. - Returns: - shape = [batch_size * seq_len, num_heads, head_size] - """ + """Forward pass with Ascend attention.""" num_tokens = query.shape[0] if output is None: output = torch.empty(num_tokens, @@ -275,7 +302,7 @@ def forward( # TODO: Remove this contiguous in the future. value = value.contiguous() - if kv_cache.numel() > 0: + if len(kv_cache) > 0: if self.key_cache is None: self.key_cache, self.value_cache = kv_cache[0], kv_cache[1] slots = attn_metadata.slot_mapping @@ -307,11 +334,13 @@ def forward( assert attn_metadata is not None assert attn_metadata.attn_mask is not None compress_mask = attn_metadata.attn_mask + batch_size = attn_metadata.query_lens.shape[0] + block_table = attn_metadata.block_tables[:batch_size, :] torch_npu._npu_flash_attention_qlens( query=query, key_cache=self.key_cache, value_cache=self.value_cache, - block_table=attn_metadata.block_tables, + block_table=block_table, mask=compress_mask, seq_len=attn_metadata.query_lens, context_lens=attn_metadata.seq_lens, @@ -320,16 +349,92 @@ def forward( scale_value=self.scale, out=output) elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly: - torch_npu._npu_paged_attention( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - block_table=attn_metadata.block_tables, - context_lens=attn_metadata.seq_lens, - out=output) + if self.full_graph: + graph_params = get_graph_params() + q = query.view(num_tokens, -1, self.hidden_size) + k = self.key_cache.view( # type: ignore + -1, self.block_size, + self.num_kv_heads * self.head_size) + v = self.value_cache.view( # type: ignore + -1, self.block_size, + self.num_kv_heads * self.head_size) + actual_seq_lens = attn_metadata.seq_lens_list + attn_args = { + "query": q, + "key": k, + "value": v, + "actual_seq_lengths_kv": actual_seq_lens, + "block_table": attn_metadata.block_tables, + "num_heads": self.num_heads, + "scale": self.scale, + "input_layout": "BSH", + "num_key_value_heads": self.num_kv_heads, + "block_size": self.block_size, + } + + # Prepare tensors for attention output + # TODO: Refactor this to step-level instead of layer-level + attn_output = torch.empty(num_tokens, + 1, + self.hidden_size, + dtype=output.dtype, + device=output.device) + softmax_lse = torch.empty(num_tokens, + dtype=output.dtype, + device=output.device) + + # Get workspace from cache or calculate it if not present. 
+ workspace = graph_params.workspaces.get(num_tokens) + if workspace is None: + workspace = torch_npu._npu_fused_infer_attention_score_get_max_workspace( + **attn_args) + graph_params.workspaces[num_tokens] = workspace + + forward_context = get_forward_context() + if not forward_context.capturing: + # Execute attention kernel directly in non-capturing mode + torch.ops.npu.npu_fused_infer_attention_score.out( + workspace=workspace, + out=[attn_output, softmax_lse], + **attn_args) + else: + # Handle graph capturing mode + stream = torch_npu.npu.current_stream() + + event = torch.npu.ExternalEvent() + event.wait(stream) + event.reset(stream) + graph_params.events[num_tokens].append(event) + + graph_params.attn_params[num_tokens].append( + (q, k, v, actual_seq_lens, + attn_metadata.block_tables, self.num_heads, + self.scale, self.num_kv_heads, attn_output, + softmax_lse)) + + torch.npu.graph_task_group_begin(stream) + torch.ops.npu.npu_fused_infer_attention_score.out( + workspace=workspace, + out=[attn_output, softmax_lse], + **attn_args) + handle = torch.npu.graph_task_group_end(stream) + graph_params.handles[num_tokens].append(handle) + + # Reshape output to match the expected format + output.copy_( + attn_output.view(num_tokens, self.num_heads, + self.head_size)) + else: + torch_npu._npu_paged_attention( + query=query, + key_cache=self.key_cache, + value_cache=self.value_cache, + num_kv_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale_value=self.scale, + block_table=attn_metadata.block_tables, + context_lens=attn_metadata.seq_lens, + out=output) # Normal V1 situation. else: # use chunked prefill for head size 192 scenario, like deepseek diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 189aa38e89..816d93c028 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -13,34 +13,23 @@ UnquantizedLinearMethod) from vllm.utils import cdiv, round_down +from vllm_ascend import envs from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.attention.utils import \ + AscendCommonAttentionMetadata as CommonAttentionMetadata from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig from vllm_ascend.multistream.context import get_multistream_comm_context from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla -from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, npu_stream_switch, + npu_wait_tensor) if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.gpu_input_batch import InputBatch -@dataclass -class CommonAttentionMetadata: - """ - Attention metadata attributes that can be shared by layers in different KV - cache groups and thus having different block table. 
- """ - - query_start_loc: torch.Tensor - """(batch_size + 1,), the start location of each request in query Tensor""" - seq_lens: torch.Tensor - """(batch_size,), the length of each request including both computed tokens - and newly scheduled tokens""" - - class AscendMLABackend(AttentionBackend): accept_output_buffer: bool = True @@ -103,6 +92,7 @@ class AscendMLADecodeMetadata: seq_lens: torch.Tensor max_seq_lens: int seq_lens_list: list[int] + actual_seq_q_lens: Optional[list[int]] = None attn_mask: Optional[torch.Tensor] = None @@ -136,8 +126,8 @@ class AscendMLAMetadata: # For logging. num_input_tokens: int = 0 # Number of tokens including padding. - max_num_tokens_across_dp: int = 0 - with_prefill_across_dp: bool = False + enable_dbo_across_dp: bool = False + is_mtp_model: bool = False query_lens: Optional[list[int]] = None # The dimension of the attention heads @@ -290,7 +280,7 @@ def _get_graph_runner_block_tables( self, num_seqs: int, block_tables: torch.Tensor) -> torch.Tensor: max_batch_size, max_blocks = self.runner.graph_block_tables.shape - assert max_batch_size >= num_seqs + assert max_batch_size >= num_seqs, f"max_batch_size: {max_batch_size} should be bigger than cur_num_seqs: {num_seqs}" if isinstance(self.runner.graph_block_tables, np.ndarray): graph_block_tables = torch.zeros((max_batch_size, max_blocks), @@ -312,8 +302,12 @@ def _get_graph_runner_block_tables( return graph_block_tables[:num_seqs, :max_blocks] - def build_dummy(self, num_reqs: int, - num_actual_tokens: int) -> AscendMLAMetadata: + def build_torchair_graph_dummy( + self, + num_reqs: int, + num_actual_tokens: int, + is_mtp_model: bool = False, + ) -> AscendMLAMetadata: device = self.runner.device _, max_blocks = self.runner.graph_block_tables.shape block_table = torch.zeros((num_reqs, max_blocks), @@ -321,11 +315,13 @@ def build_dummy(self, num_reqs: int, device=device) block_table = self._get_graph_runner_block_tables( num_reqs, block_table) - seq_lens = torch.ones(num_reqs, dtype=torch.int32, device=device) - input_positions = torch.zeros(num_reqs, + num_tokens = num_reqs * self.runner.decode_token_per_req + seq_lens = torch.zeros(num_reqs, dtype=torch.int32, device=device) + seq_lens_list = seq_lens.tolist() + input_positions = torch.zeros(num_tokens, dtype=torch.int32, device=device).long() - slot_mapping = torch.full((num_reqs, ), + slot_mapping = torch.full((num_tokens, ), PAD_SLOT_ID, dtype=torch.int32, device=device) @@ -333,28 +329,38 @@ def build_dummy(self, num_reqs: int, -1, dtype=torch.int32, device=device) + if self.runner.speculative_config is not None and\ + self.runner.speculative_config.method == 'deepseek_mtp' and not is_mtp_model: + attn_state = AscendAttentionState.SpecDecoding + num_decode_tokens = 2 + else: + attn_state = AscendAttentionState.DecodeOnly + num_decode_tokens = 1 decode_metadata = AscendMLADecodeMetadata( input_positions=input_positions, block_table=block_table, seq_lens=seq_lens, - seq_lens_list=seq_lens.tolist(), + seq_lens_list=seq_lens_list, max_seq_lens=1, - attn_mask=self.runner.spec_attn_mask) + attn_mask=self.runner.spec_attn_mask, + actual_seq_q_lens=self.runner.actual_seq_q_lens[:num_reqs], + ) return self.metadata_cls( # type: ignore num_input_tokens=num_actual_tokens, num_actual_tokens=num_actual_tokens, slot_mapping=slot_mapping, head_dim=self.runner.model_config.get_head_size(), num_decodes=1, - num_decode_tokens=1, + num_decode_tokens=num_decode_tokens, num_prefills=0, attn_mask=self.runner.attn_mask, - attn_state=AscendAttentionState.DecodeOnly, + 
attn_state=attn_state, prefill=None, decode=decode_metadata, query_start_loc=query_start_loc, seq_lens=seq_lens, block_tables=block_table, + is_mtp_model=is_mtp_model, ) def build( @@ -364,9 +370,10 @@ def build( max_query_len: int, common_attn_metadata: CommonAttentionMetadata, common_prefix_len: Optional[int] = None, - graph_pad_size: int = -1, - max_num_tokens_across_dp: int = 0, - with_prefill_across_dp: bool = False, + num_token_pad_size: int = -1, + num_reqs_pad_size: int = 0, + enable_dbo_across_dp: bool = False, + is_mtp_model: bool = False, ) -> AscendMLAMetadata: assert self._num_decodes + self._num_prefills == num_reqs @@ -450,8 +457,9 @@ def build( ) decode_metadata = None - use_torchair_graph = graph_pad_size != -1 + use_torchair_graph = num_token_pad_size != -1 if self._num_decodes > 0: + actual_seq_q_lens = None max_seq_lens = seq_lens[:self._num_decodes].max().item() seq_lens = seq_lens[:self._num_decode_tokens] input_positions = input_positions[:self._num_decode_tokens] @@ -460,41 +468,48 @@ def build( AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding ]: - num_seqs = len(seq_lens) - if graph_pad_size != 0: - pad_value = 1 - padded_seq_lens = seq_lens.tolist() + [pad_value - ] * graph_pad_size + if num_token_pad_size != 0: + pad_value = 0 + padded_seq_lens = seq_lens.tolist( + ) + [pad_value] * num_reqs_pad_size else: padded_seq_lens = seq_lens.tolist() seq_lens = torch.from_numpy( np.array(padded_seq_lens).astype(np.int32)) - padding = torch.full((graph_pad_size, ), + seq_lens_list = padded_seq_lens + padding = torch.full((num_token_pad_size, ), PAD_SLOT_ID, dtype=slot_mapping.dtype, device=slot_mapping.device) slot_mapping = torch.cat([slot_mapping, padding]) block_table_padding = torch.zeros( - (graph_pad_size, ) + block_table.shape[1:], + (num_reqs_pad_size, ) + block_table.shape[1:], dtype=block_table.dtype, device=block_table.device) block_table = torch.cat([block_table, block_table_padding], dim=0) block_table = self._get_graph_runner_block_tables( - num_seqs + graph_pad_size, block_table) - padding_0 = torch.zeros(graph_pad_size, + num_reqs + num_reqs_pad_size, block_table) + padding_0 = torch.zeros(num_token_pad_size, dtype=input_positions.dtype, device=input_positions.device) input_positions = torch.cat([input_positions, padding_0]) + actual_seq_q_lens = query_start_loc[1:].tolist( + ) + self.runner.actual_seq_q_lens[num_reqs:num_reqs + + num_reqs_pad_size] + else: + seq_lens_list = seq_lens.tolist() decode_metadata = AscendMLADecodeMetadata( input_positions=input_positions, block_table=block_table, seq_lens=seq_lens, - seq_lens_list=seq_lens.tolist(), + seq_lens_list=seq_lens_list, max_seq_lens=max_seq_lens, - attn_mask=self.runner.spec_attn_mask) + attn_mask=self.runner.spec_attn_mask, + actual_seq_q_lens=actual_seq_q_lens, + ) return self.metadata_cls( # type: ignore num_actual_tokens=num_actual_tokens, @@ -511,8 +526,8 @@ def build( query_start_loc=query_start_loc, block_tables=block_table, seq_lens=seq_lens, - max_num_tokens_across_dp=max_num_tokens_across_dp, - with_prefill_across_dp=with_prefill_across_dp, + enable_dbo_across_dp=enable_dbo_across_dp, + is_mtp_model=is_mtp_model, ) @@ -570,15 +585,6 @@ def __init__( self.spec_token_num = speculative_config.num_speculative_tokens assert self.spec_token_num > 0 - # TODO: support numHeads / numKvHeads < 16 in MLA kernel - if self.torchair_graph_enabled: - assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ - ("The allowed number of queries per kv when enabling both MLA and Graph 
mode" - " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," - " as it only has 16 attention heads. And if you're using DeepSeek-V3 or DeepSeek-R1," - " please make sure after the tensor parallel split, num_heads / num_kv_heads in " - "{32, 64, 128}.") - def _v_up_proj_and_o_proj(self, x): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) @@ -651,20 +657,23 @@ def get_and_maybe_dequant_weights(layer: LinearBase): self.W_UV = W_UV.transpose(0, 1).contiguous() # Convert from (L, N, P) to (N, P, L) self.W_UK_T = W_UK.permute(1, 2, 0).contiguous() - - # Waiting for BMM NZ support - # self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data, 29) - # self.W_UK_T.data = torch_npu.npu_format_cast(self.W_UK_T.data, 29) + if get_ascend_config().enable_weight_nz_layout: + # cast quantized weight tensors in NZ layout for higher inference speed + self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data, + ACL_FORMAT_FRACTAL_NZ) + self.W_UK_T.data = torch_npu.npu_format_cast( + self.W_UK_T.data, ACL_FORMAT_FRACTAL_NZ) def _compute_prefill_context( self, query: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, + kv_c_and_k_pe_cache: Tuple[torch.Tensor], rope_dim: int, attn_metadata: AscendMLAMetadata, prefix_output: torch.Tensor, prefix_lse: torch.Tensor, ): + assert len(kv_c_and_k_pe_cache) > 1 prefill_metadata = attn_metadata.prefill if prefill_metadata is None or prefill_metadata.chunked_context is None: return prefix_output, prefix_lse @@ -674,21 +683,22 @@ def _compute_prefill_context( q_nope = query[..., :self.qk_nope_head_dim] seq_len1 = torch.tensor(prefill_metadata.query_lens, dtype=torch.int32) - latent_kv_dim = kv_c_and_k_pe_cache.size(3) - rope_dim - cache_kv_c = kv_c_and_k_pe_cache[:, :, :, :latent_kv_dim] - cache_k_pe = kv_c_and_k_pe_cache[:, :, :, latent_kv_dim:] + cache_kv_c = kv_c_and_k_pe_cache[0] + cache_k_pe = kv_c_and_k_pe_cache[1] + num_heads = cache_k_pe.size(2) + latent_kv_dim = kv_c_and_k_pe_cache[0].size(-1) for i in range(iters): toks = prefill_metadata.chunked_context.seq_tot[i] seq_len2 = prefill_metadata.chunked_context.chunk_seq_lens[i] seq_len = torch.stack([seq_len1, seq_len2]) kv_c_normed = torch.empty(toks, - kv_c_and_k_pe_cache.size(2), + num_heads, latent_kv_dim, dtype=query.dtype, device=query.device) k_pe = torch.empty(toks, - kv_c_and_k_pe_cache.size(2), + num_heads, rope_dim, dtype=query.dtype, device=query.device) @@ -738,10 +748,11 @@ def _forward_prefill( query: torch.Tensor, kv_c_normed: torch.Tensor, k_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, + kv_c_and_k_pe_cache: Tuple[torch.Tensor], attn_metadata: AscendMLAMetadata, ) -> torch.Tensor: assert attn_metadata.prefill is not None + assert len(kv_c_and_k_pe_cache) > 1 num_tokens = query.size(0) attn_output = torch.empty(num_tokens, @@ -758,7 +769,8 @@ def _forward_prefill( if attn_metadata.attn_state in [ AscendAttentionState.ChunkedPrefill, - AscendAttentionState.SpecDecoding + AscendAttentionState.SpecDecoding, + AscendAttentionState.PrefillCacheHit ] and not ascend_config.chunked_prefill_for_mla: attn_output_torch = torch.empty(num_tokens, self.num_heads * self.v_head_dim, @@ -783,7 +795,8 @@ def _forward_prefill( causal=True) elif attn_metadata.attn_state in [ AscendAttentionState.ChunkedPrefill, - AscendAttentionState.SpecDecoding + AscendAttentionState.SpecDecoding, + AscendAttentionState.PrefillCacheHit ]: attn_lse = torch.empty(self.num_heads, num_tokens, @@ -833,15 +846,12 @@ def _forward_prefill( 
num_kv_heads=self.num_heads, out=attn_output) attn_output = attn_output.view(-1, self.num_heads, self.v_head_dim) - else: - raise RuntimeError( - "Unexpected path reached, AscendMLAImpl should only have PrefillNoCache, ChunkedPrefill and SpecDecoding scenario in forward prefill, please file a bug to vllm-ascend !" - ) attn_output = attn_output.reshape( [num_tokens, self.num_heads * self.v_head_dim]) if attn_metadata.attn_state in [ AscendAttentionState.ChunkedPrefill, - AscendAttentionState.SpecDecoding + AscendAttentionState.SpecDecoding, + AscendAttentionState.PrefillCacheHit ] and not ascend_config.chunked_prefill_for_mla: attn_output = attn_output_torch @@ -934,44 +944,17 @@ def _forward_decode( q_pe: torch.Tensor, k_nope: torch.Tensor, k_pe: torch.Tensor, - kv_c_and_k_pe_cache: torch.Tensor, + kv_c_and_k_pe_cache: Tuple[torch.Tensor], attn_metadata: AscendMLAMetadata, ) -> torch.Tensor: decode_meta = attn_metadata.decode assert decode_meta is not None - - q = torch.cat([q_nope, q_pe], dim=-1) - num_tokens = q.size(0) - attn_output = torch.empty( - [num_tokens, self.num_heads, self.kv_lora_rank], - dtype=q.dtype, - device=q.device) + num_tokens = q_nope.size(0) if self.running_in_graph: - # TorchAir's shape is [bs, num_heads_per_rank, q_seq_len, dim] - if attn_metadata.attn_state == AscendAttentionState.SpecDecoding: - assert num_tokens % self.spec_token_num == 0 - q_nope = q_nope.view(num_tokens // (self.spec_token_num + 1), - self.spec_token_num + 1, self.num_heads, - -1) - q_pe = q_pe.view(num_tokens // (self.spec_token_num + 1), - self.spec_token_num + 1, self.num_heads, -1) - if not self.enable_kv_nz: - q_nope = q_nope.transpose(1, 2).contiguous() - q_pe = q_pe.transpose(1, 2).contiguous() - sparse_mode = 3 - spec_attn_mask = attn_metadata.decode.attn_mask # type:ignore - else: - if self.enable_kv_nz: - q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1) - q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1) - else: - q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) - q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) - sparse_mode = 0 - spec_attn_mask = None # shape of knope/k_pe for npu graph mode should be: # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim] block_size = kv_c_and_k_pe_cache[0].shape[1] + actual_seq_lengths = None if self.enable_kv_nz: k_nope = k_nope.view(-1, self.num_kv_heads, self.kv_lora_rank // 16, block_size, 16) @@ -985,6 +968,26 @@ def _forward_decode( self.qk_rope_head_dim) input_layout = "BNSD" + # TorchAir's shape is [bs, num_heads_per_rank, q_seq_len, dim] + if attn_metadata.attn_state == AscendAttentionState.SpecDecoding: + assert num_tokens % self.spec_token_num == 0 + # [bs * q_seq_len, num_heads_per_rank, dim] + input_layout = "TND" + q_nope = q_nope.view(num_tokens, self.num_heads, -1) + q_pe = q_pe.view(num_tokens, self.num_heads, -1) + sparse_mode = 3 + spec_attn_mask = attn_metadata.decode.attn_mask # type:ignore + actual_seq_lengths = decode_meta.actual_seq_q_lens + else: + if self.enable_kv_nz: + q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1) + q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1) + else: + q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) + q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) + sparse_mode = 0 + spec_attn_mask = None + attn_output, _ = torch_npu.npu_fused_infer_attention_score( q_nope, k_nope, @@ -1002,18 +1005,37 @@ def _forward_decode( block_table=decode_meta.block_table, block_size=block_size, actual_seq_lengths_kv=decode_meta.seq_lens_list, - ) + 
actual_seq_lengths=actual_seq_lengths) else: - torch_npu._npu_paged_attention_mla( - query=q, - key_cache=kv_c_and_k_pe_cache, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - block_table=attn_metadata.decode.block_table, # type:ignore - context_lens=attn_metadata.decode.seq_lens, # type:ignore - mla_vheadsize=self.kv_lora_rank, - out=attn_output) + # The MLA_PA path will be used as default path in the future, `_npu_paged_attention_mla` will + # be removed after the torch_npu contains `torch_npu.atb.npu_multi_head_latent_attention` become + # public available + assert len(kv_c_and_k_pe_cache) > 1 + if envs.VLLM_ASCEND_MLA_PA: + attn_output = torch_npu.atb.npu_multi_head_latent_attention( + q_nope, q_pe, kv_c_and_k_pe_cache[0], + kv_c_and_k_pe_cache[1], attn_metadata.decode.block_table, + attn_metadata.decode.seq_lens, self.num_heads, self.scale, + self.num_kv_heads) + else: + q = torch.cat([q_nope, q_pe], dim=-1) + attn_output = torch.empty( + [num_tokens, self.num_heads, self.kv_lora_rank], + dtype=q.dtype, + device=q.device) + k_cache = torch.cat( + [kv_c_and_k_pe_cache[0], kv_c_and_k_pe_cache[1]], dim=-1) + torch_npu._npu_paged_attention_mla( + query=q, + key_cache=k_cache, + num_kv_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale_value=self.scale, + block_table=attn_metadata.decode. + block_table, # type:ignore + context_lens=attn_metadata.decode.seq_lens, # type:ignore + mla_vheadsize=self.kv_lora_rank, + out=attn_output) current_ms_metadata = get_multistream_comm_context() if current_ms_metadata is None: return self._v_up_proj_and_o_proj(attn_output) @@ -1029,7 +1051,7 @@ def forward( hidden_states_or_q_c: torch.Tensor, # query in unified attn hidden_states_or_kv_c_normed: torch.Tensor, # key in unified attn k_pe: torch.Tensor, # value in unified attn - kv_cache: torch.Tensor, + kv_cache: Tuple[torch.Tensor], attn_metadata: M, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -1037,16 +1059,17 @@ def forward( if attn_metadata is None: # Profiling run. return output + # mtp model is not support for graph mode yet + self.torchair_graph_enabled = self.torchair_graph_enabled and not attn_metadata.is_mtp_model self.running_in_graph = self.torchair_graph_enabled and attn_metadata.attn_state in [ AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding ] num_actual_toks = attn_metadata.num_actual_tokens if k_pe is None and not self.running_in_graph: - if not self.torchair_graph_enabled: - kv_c, k_pe = self.kv_a_proj_with_mqa( - hidden_states_or_kv_c_normed)[0].split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) + kv_c, k_pe = self.kv_a_proj_with_mqa( + hidden_states_or_kv_c_normed)[0].split( + [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) else: kv_c_normed = hidden_states_or_kv_c_normed assert attn_metadata.num_decodes is not None and \ @@ -1065,19 +1088,20 @@ def forward( if not self.running_in_graph: hidden_states_or_q_c = hidden_states_or_q_c[:num_actual_toks, ...] prefill_hs_or_q_c = hidden_states_or_q_c[num_decode_tokens:] - if not self.torchair_graph_enabled: - decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] - k_pe = k_pe[:num_actual_toks, ...] 
- k_pe = k_pe.unsqueeze(1) - decode_k_pe = k_pe[:num_decode_tokens] - prefill_k_pe = k_pe[num_decode_tokens:] + decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] + prefill_hs = hidden_states_or_kv_c_normed[num_decode_tokens:] + # if not self.torchair_graph_enabled: + k_pe = k_pe[:num_actual_toks, ...] + k_pe = k_pe.unsqueeze(1) + decode_k_pe = k_pe[:num_decode_tokens] + prefill_k_pe = k_pe[num_decode_tokens:] else: decode_hs_or_q_c = hidden_states_or_q_c if has_decode: decode_k_nope = None assert attn_metadata.decode is not None if self.running_in_graph: - seq_len = self.rotary_emb.max_position_embeddings + seq_len = self.rotary_emb.max_position_embeddings * self.rotary_emb.scaling_factor cos = self.rotary_emb.cos_cached[:seq_len].to( dtype=decode_hs_or_q_c.dtype) sin = self.rotary_emb.sin_cached[:seq_len].to( @@ -1111,9 +1135,7 @@ def forward( else: decode_q_pe[...], decode_k_pe[...] = self.rotary_emb( attn_metadata.decode.input_positions, - decode_q_pe.contiguous(), - decode_k_pe, - max_seq_len=attn_metadata.decode.max_seq_lens) + decode_q_pe.contiguous(), decode_k_pe) if has_prefill: assert attn_metadata.prefill is not None prefill_q = self.q_proj(prefill_hs_or_q_c)[0]\ @@ -1122,7 +1144,7 @@ def forward( prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim] if self.torchair_graph_enabled: num_tokens = prefill_hs_or_q_c.shape[0] - seq_len = self.rotary_emb.max_position_embeddings + seq_len = self.rotary_emb.max_position_embeddings * self.rotary_emb.scaling_factor cos = self.rotary_emb.cos_cached[:seq_len].to( dtype=prefill_q_pe.dtype) sin = self.rotary_emb.sin_cached[:seq_len].to( @@ -1134,22 +1156,24 @@ def forward( prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin) prefill_k_pe, prefill_k_nope = self.exec_kv_prefill( - hidden_states_or_kv_c_normed, cos, sin, kv_cache, - attn_metadata.slot_mapping) + prefill_hs, cos, sin, kv_cache, + attn_metadata.slot_mapping[num_decode_tokens:]) kv_c_normed = prefill_k_nope[:num_actual_toks, ...] - prefill_k_c_normed = prefill_k_nope[num_decode_tokens:] + prefill_k_c_normed = prefill_k_nope prefill_k_pe = prefill_k_pe.view(num_tokens, self.num_kv_heads, -1) prefill_q = torch.cat([prefill_q_nope, prefill_q_pe], dim=-1) else: prefill_q_pe[...], prefill_k_pe[...] 
= self.rotary_emb( attn_metadata.prefill.input_positions, - prefill_q_pe.contiguous(), - prefill_k_pe, - max_seq_len=attn_metadata.prefill.max_seq_lens) + prefill_q_pe.contiguous(), prefill_k_pe) + + assert len( + kv_cache + ) > 1, "the number of kv cache should be greater than 1, namely (nope_cache and rope_cache)" if self.torchair_graph_enabled: - if len(kv_cache) > 0 and kv_cache[0].numel( + if kv_cache[0].numel( ) > 0 and attn_metadata.attn_state == AscendAttentionState.PrefillNoCache: slots = attn_metadata.slot_mapping # NOTE: Separate the kv cache in advance to avoid OOM or other issues @@ -1159,16 +1183,15 @@ def forward( key_cache=kv_cache[0], value_cache=kv_cache[1], slot_indices=slots) - elif kv_cache.numel() > 0: - key = torch.cat([ - kv_c_normed.view([num_actual_toks, self.num_kv_heads, -1]), - k_pe - ], - dim=2) - torch_npu._npu_reshape_and_cache_siso( - key=key, - key_cache=kv_cache, - slot_indices=attn_metadata.slot_mapping.flatten()) + else: + kv_c_normed = kv_c_normed.view( + [num_actual_toks, self.num_kv_heads, -1]) + torch_npu._npu_reshape_and_cache( + key=kv_c_normed, + value=k_pe, + key_cache=kv_cache[0], + value_cache=kv_cache[1], + slot_indices=attn_metadata.slot_mapping) if has_prefill: # FIX: aicore move should be also placed on the comm stream in dbo, # otherwise it may affect the accuracy diff --git a/vllm_ascend/attention/utils.py b/vllm_ascend/attention/utils.py new file mode 100644 index 0000000000..c2b7bc156a --- /dev/null +++ b/vllm_ascend/attention/utils.py @@ -0,0 +1,23 @@ +from dataclasses import dataclass +from typing import Optional + +import torch + + +@dataclass +class AscendCommonAttentionMetadata: + """ + Attention metadata attributes that can be shared by layers in different KV + cache groups and thus having different block table. 
+ """ + + query_start_loc: torch.Tensor = None + """(batch_size + 1,), the start location of each request in query Tensor""" + seq_lens: Optional[torch.Tensor] = None + """(batch_size,), the length of each request including both computed tokens + and newly scheduled tokens""" + query_lens: Optional[torch.Tensor] = None + """(batch_size,), the length of each request including only the newly + scheduled tokens""" + seq_lens_list: Optional[list] = None + """(num_input_tokens,), note that this is specifically for FIA kernel""" diff --git a/vllm_ascend/compilation/piecewise_backend.py b/vllm_ascend/compilation/piecewise_backend.py index c6a800b3d8..aafe639373 100644 --- a/vllm_ascend/compilation/piecewise_backend.py +++ b/vllm_ascend/compilation/piecewise_backend.py @@ -28,9 +28,13 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.monitor import end_monitoring_torch_compile from vllm.config import VllmConfig +from vllm.forward_context import get_forward_context from vllm.logger import logger from vllm.utils import weak_ref_tensors +from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.utils import get_graph_params, set_graph_params + @dataclasses.dataclass class ConcreteSizeEntry: @@ -95,6 +99,10 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" + if self.compilation_config.full_cuda_graph: + self.update_stream = torch.npu.Stream() + set_graph_params(self.aclgraph_capture_sizes) + # the entries for different shapes that we need to either # compile or capture aclgraph self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} @@ -116,7 +124,40 @@ def check_for_ending_compilation(self): self.vllm_backend.compiler_manager.save_to_file() end_monitoring_torch_compile(self.vllm_config) + def update_attn_params(self, graph_params, forward_context, runtime_shape): + for layer_idx in range(len(graph_params.handles[runtime_shape])): + query, key, value, actual_seq_lens, block_table, num_heads, scale, num_kv_heads, output, softmax_lse = graph_params.attn_params[ + runtime_shape][layer_idx] + block_table = forward_context.attn_metadata.block_tables + actual_seq_lens = forward_context.attn_metadata.seq_lens_list + + with torch.npu.stream(self.update_stream): + torch.npu.graph_task_update_begin( + self.update_stream, + graph_params.handles[runtime_shape][layer_idx]) + torch.ops.npu.npu_fused_infer_attention_score.out( + query, + key, + value, + workspace=graph_params.workspaces[runtime_shape], + actual_seq_lengths_kv=actual_seq_lens, + block_table=block_table, + num_heads=num_heads, + scale=scale, + input_layout="BSH", + num_key_value_heads=num_kv_heads, + block_size=128, + out=[output, softmax_lse], + ) + torch.npu.graph_task_update_end(self.update_stream) + + graph_params.events[runtime_shape][layer_idx].record( + self.update_stream) + def __call__(self, *args) -> Any: + forward_context = get_forward_context() + graph_params = get_graph_params() + if not self.first_run_finished: self.first_run_finished = True self.check_for_ending_compilation() @@ -127,6 +168,11 @@ def __call__(self, *args) -> Any: # we don't need to do anything for this shape return self.compiled_graph_for_general_shape(*args) + if (getattr(forward_context.attn_metadata, "attn_state", + None) != AscendAttentionState.DecodeOnly + and self.compilation_config.full_cuda_graph): + return self.compiled_graph_for_general_shape(*args) + entry = self.concrete_size_entries[runtime_shape] if 
entry.runnable is None: @@ -189,6 +235,7 @@ def __call__(self, *args) -> Any: patch("torch.npu.empty_cache", lambda: None)) # mind-exploding: carefully manage the reference and memory. + forward_context.capturing = True with torch.npu.graph(aclgraph, pool=self.graph_pool): # `output` is managed by pytorch's aclgraph pool output = entry.runnable(*args) @@ -222,4 +269,9 @@ def __call__(self, *args) -> Any: ) entry.aclgraph.replay() + + if self.compilation_config.full_cuda_graph: + self.update_attn_params(graph_params, forward_context, + runtime_shape) + return entry.output diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index 2fa31c264c..3f1477c9f9 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -23,7 +23,6 @@ from vllm.logger import logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.utils import cdiv -from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.engine import EngineCoreEventType, EngineCoreOutputs @@ -87,14 +86,11 @@ def skip_cur_request(): self.waiting.popleft() skipped_waiting_requests.appendleft(request) - num_prealloc_computed_tokens = 0 # P/D: skip request if still waiting for remote kvs. if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: is_ready = self._update_waiting_for_remote_kv(request) if is_ready: request.status = RequestStatus.WAITING - num_prealloc_computed_tokens = ( - request.num_computed_tokens) else: skip_cur_request() continue @@ -112,8 +108,8 @@ def skip_cur_request(): load_kv_async = False # Get already-cached tokens. - if num_prealloc_computed_tokens == 0: - new_computed_blocks, num_native_computed_tokens = \ + if request.num_computed_tokens == 0: + new_computed_blocks, num_new_local_computed_tokens = \ self.kv_cache_manager.get_computed_blocks( request) @@ -121,18 +117,17 @@ def skip_cur_request(): if self.connector is not None: num_external_computed_tokens, load_kv_async = ( self.connector.get_num_new_matched_tokens( - request, num_native_computed_tokens)) + request, num_new_local_computed_tokens)) # Total computed tokens (local + external). - num_computed_tokens = (num_native_computed_tokens + + num_computed_tokens = (num_new_local_computed_tokens + num_external_computed_tokens) else: # P/D: skip checking prefix cache if loaded from remote kvs. - new_computed_blocks = KVCacheBlocks.create_empty() - num_native_computed_tokens = 0 - - # Total computed tokens (allocated in prior step). - num_computed_tokens = num_prealloc_computed_tokens + new_computed_blocks = ( + self.kv_cache_manager.create_empty_block_list()) + num_new_local_computed_tokens = 0 + num_computed_tokens = request.num_computed_tokens # P/D: loading remote KV, do not allocate for new work. if load_kv_async: @@ -142,9 +137,6 @@ def skip_cur_request(): # Number of tokens to be scheduled. else: prompt_limit = self._get_prompt_limit(request) - # Get already-cached tokens. - computed_blocks, num_computed_tokens = ( - self.kv_cache_manager.get_computed_blocks(request)) # We use `request.num_tokens` instead of # `request.num_prompt_tokens` to consider the resumed # requests, which have output tokens. 
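To make the token accounting in the hunk above concrete, here is a small worked example with made-up numbers; the variable names follow the scheduler code, while the figures themselves are illustrative only:

# Illustrative only: a 1024-token prompt with block_size=128.
num_new_local_computed_tokens = 256   # two blocks already hit in the local prefix cache
num_external_computed_tokens = 512    # four more blocks offered by the KV connector
num_computed_tokens = (num_new_local_computed_tokens +
                       num_external_computed_tokens)   # 768 tokens need no recompute
num_new_tokens = 1024 - num_computed_tokens            # 256 tokens left to prefill locally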
@@ -172,7 +164,7 @@ def skip_cur_request(): skip_cur_request() continue assert num_new_tokens > 0 - blocks = computed_blocks.blocks[0] + blocks = new_computed_blocks.blocks[0] watermark = getattr(self.scheduler_config, "watermark", 0.01) if not self._check_watermark_for_prefill(request, num_new_tokens, @@ -184,8 +176,8 @@ def skip_cur_request(): new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens + num_external_computed_tokens, - num_native_computed_tokens, - new_computed_blocks=computed_blocks, + num_new_local_computed_tokens, + new_computed_blocks=new_computed_blocks, num_lookahead_tokens=self.num_lookahead_tokens, delay_cache_blocks=load_kv_async) if new_blocks is None: @@ -195,8 +187,7 @@ def skip_cur_request(): # KVConnector: update internal state after allocation. # This information is used to determine if a load is # needed for this request. - if num_external_computed_tokens: - assert self.connector is not None + if self.connector is not None: self.connector.update_state_after_alloc( request, new_computed_blocks + new_blocks, @@ -210,6 +201,7 @@ def skip_cur_request(): skipped_waiting_requests.appendleft(request) request.status = RequestStatus.WAITING_FOR_REMOTE_KVS continue + self.running.append(request) if self.log_stats: request.record_event(EngineCoreEventType.SCHEDULED, @@ -509,3 +501,40 @@ def update_from_output( return super().update_from_output(scheduler_output, model_runner_output) + + def _update_waiting_for_remote_kv(self, request: Request) -> bool: + """ + KV Connector: check if the request_id is finished_recving. + + The finished_recving_kv_req_ids list is populated + on the previous steps()'s update_from_output based + on the worker side connector. + + When the kv transfer is ready, we cache the blocks + and the request state will be moved back to WAITING from + WAITING_FOR_REMOTE_KV. + """ + assert self.connector is not None + if request.request_id not in self.finished_recving_kv_req_ids: + return False + + # Now that the blocks are ready, actually cache them. + (block_ids, ) = self.kv_cache_manager.get_block_ids(request.request_id) + num_computed_tokens = len(block_ids) * self.block_size + # Handle the case where num request tokens less then one block. + num_computed_tokens = min(num_computed_tokens, request.num_tokens) + if num_computed_tokens == request.num_tokens: + num_computed_tokens -= 1 + + # This will cache the blocks if caching is enabled. + # Note: vllm fix this in main branch, but still have issue on v0.9.1, so we just adopt the + # change on 0.9.1 and without cherry-pick this back to main branch on vllm-ascend + if self.kv_cache_manager.enable_caching: + self.kv_cache_manager.cache_blocks(request, num_computed_tokens) + + # Update the request state for scheduling. + request.num_computed_tokens = num_computed_tokens + + # Return that we are ready. + self.finished_recving_kv_req_ids.remove(request.request_id) + return True diff --git a/vllm_ascend/device_allocator/camem.py b/vllm_ascend/device_allocator/camem.py index 7156602ecd..7592a04365 100644 --- a/vllm_ascend/device_allocator/camem.py +++ b/vllm_ascend/device_allocator/camem.py @@ -144,7 +144,7 @@ def get_instance() -> "CaMemAllocator": return CaMemAllocator.instance def __init__(self): - conf = os.environ.get("PYTORCH_NPU_ALLOC_CONF", "") + conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "") assert "expandable_segments:True" not in conf, \ ("Expandable segments are not compatible with memory pool. 
" "Please track https://github.com/pytorch/pytorch/issues/147851 " diff --git a/vllm_ascend/distributed/__init__.py b/vllm_ascend/distributed/__init__.py index 88c2f2199b..d7be705c2b 100644 --- a/vllm_ascend/distributed/__init__.py +++ b/vllm_ascend/distributed/__init__.py @@ -25,3 +25,8 @@ KVConnectorFactory.register_connector( "AscendSimpleConnector", "vllm_ascend.distributed.kv_transfer.simple_connector", "SimpleConnector") + +KVConnectorFactory.register_connector( + "LLMDataDistCMgrConnector", + "vllm_ascend.distributed.llmdatadist_c_mgr_connector", + "LLMDataDistCMgrConnector") diff --git a/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py new file mode 100644 index 0000000000..34543cc05c --- /dev/null +++ b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py @@ -0,0 +1,789 @@ +import contextlib +import json +import math +import threading +import time +from collections.abc import Iterator +from dataclasses import dataclass +from typing import Any, Optional, Tuple + +import llm_datadist # type: ignore +import msgspec +import torch +import zmq +from llm_datadist import (BlocksCacheKey, CacheDesc, LLMConfig, LLMDataDist, + LLMException, LLMRole) +from vllm.config import KVTransferConfig, VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.distributed.parallel_state import get_tp_group, get_world_group +from vllm.forward_context import ForwardContext +from vllm.utils import get_ip, logger +from vllm.v1.core.kv_cache_manager import KVCacheBlocks +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.request import Request, RequestStatus + +from vllm_ascend import envs +from vllm_ascend.soc_info import NPUSocInfo + +TORCH_DTYPE_TO_NPU_DTYPE = { + torch.half: llm_datadist.DataType.DT_FLOAT16, + torch.float16: llm_datadist.DataType.DT_FLOAT16, + torch.bfloat16: llm_datadist.DataType.DT_BF16, + torch.float: llm_datadist.DataType.DT_FLOAT, + torch.float32: llm_datadist.DataType.DT_FLOAT, + torch.int8: llm_datadist.DataType.DT_INT8, + torch.int64: llm_datadist.DataType.DT_INT64, + torch.int32: llm_datadist.DataType.DT_INT32 +} + + +class LLMDataDistCMgrAgentMetadata(msgspec.Struct): + super_pod_id: str + server_id: str + device_id: str + device_ip: str + super_device_id: str + cluster_id: int + + +@dataclass +class ReqMeta: + local_block_ids: list[int] + remote_block_ids: list[int] + remote_host: str + remote_port: str + engine_id: str + remote_tp_size: str + + +class LLMDataDistCMgrConnectorMetadata(KVConnectorMetadata): + + def __init__(self): + self.requests: dict[str, ReqMeta] = {} + + def add_new_req(self, request_id: str, local_block_ids: list[int], + kv_transfer_params: dict[str, Any]): + self.requests[request_id] = ReqMeta( + local_block_ids=local_block_ids, + remote_block_ids=kv_transfer_params["remote_block_ids"], + engine_id=kv_transfer_params["remote_engine_id"], + remote_host=kv_transfer_params["remote_host"], + remote_port=kv_transfer_params["remote_port"], + remote_tp_size=kv_transfer_params["remote_tp_size"], + ) + + +class LLMDataDistCMgrConnector(KVConnectorBase_V1): + + def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): + assert vllm_config.kv_transfer_config is not None + self.engine_id = vllm_config.kv_transfer_config.engine_id + if role == KVConnectorRole.SCHEDULER: + self.connector_scheduler: Optional[ + LLMDataDistCMgrConnectorScheduler] = LLMDataDistCMgrConnectorScheduler( + 
vllm_config, self.engine_id) + elif role == KVConnectorRole.WORKER: + self.connector_scheduler = None + self.connector_worker = LLMDataDistCMgrConnectorWorker(vllm_config) + + ############################################################ + # Scheduler Side Methods + ############################################################ + + def get_num_new_matched_tokens( + self, request: "Request", + num_computed_tokens: int) -> tuple[int, bool]: + assert self.connector_scheduler is not None + return self.connector_scheduler.get_num_new_matched_tokens( + request, num_computed_tokens) + + def update_state_after_alloc(self, request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int): + assert self.connector_scheduler is not None + return self.connector_scheduler.update_state_after_alloc( + request, blocks, num_external_tokens) + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput, + ) -> KVConnectorMetadata: + assert self.connector_scheduler is not None + return self.connector_scheduler.build_connector_meta(scheduler_output) + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + assert self.connector_scheduler is not None + return self.connector_scheduler.request_finished(request, block_ids) + + ############################################################ + # Worker Side Methods + ############################################################ + def register_kv_caches( + self, + kv_caches: dict[str, # type: ignore[override] + Tuple[torch.Tensor]]): + assert self.connector_worker is not None + self.connector_worker.register_kv_caches(kv_caches) + + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[Optional[set[str]], Optional[set[str]]]: + """Get the finished recving and sending requests.""" + assert self.connector_worker is not None + return self.connector_worker.get_finished(finished_req_ids) + + def start_load_kv(self, forward_context: "ForwardContext", + **kwargs) -> None: + assert self.connector_worker is not None + assert isinstance(self._connector_metadata, + LLMDataDistCMgrConnectorMetadata) + self.connector_worker.start_load_kv(self._connector_metadata) + + def wait_for_layer_load(self, layer_name: str) -> None: + """LLMDataDistCMgrConnector does not do layerwise saving, the load is in blocking manager.""" + pass + + def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, + attn_metadata, **kwargs) -> None: + """LLMDataDistCMgrConnector does not save explicitly.""" + pass + + def wait_for_save(self): + """LLMDataDistCMgrConnector does not save explicitly.""" + pass + + +class LLMDataDistCMgrConnectorScheduler(): + + def __init__(self, vllm_config: VllmConfig, engine_id: Optional[str]): + self.vllm_config = vllm_config + self.block_size = vllm_config.cache_config.block_size + self.engine_id = engine_id + self.local_ip = get_ip() + # Can not retrieve the parallel config since it is not initialized. 
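        # NOTE (editorial sketch, not part of the patch): the port arithmetic
        # below gives every DP rank a contiguous block of tp_size ports
        # starting at VLLM_LLMDD_RPC_PORT. The scheduler advertises the base
        # port of its block, and each TP worker listens on base + tp_rank
        # (see listen_for_agent_metadata_req in the worker class below).
        # Example, assuming VLLM_LLMDD_RPC_PORT=5557, tp_size=2, dp_rank_local=1:
        #   scheduler port    = 1 * 2 + 5557 = 5559
        #   worker, tp_rank 0 = 5557 + 1 * 2 + 0 = 5559
        #   worker, tp_rank 1 = 5557 + 1 * 2 + 1 = 5560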
+ self.local_dp_rank = None + self.tp_size = None + dp_rank_local = self.vllm_config.parallel_config.data_parallel_rank_local + tp_size = self.vllm_config.parallel_config.tensor_parallel_size + + self.port = dp_rank_local * tp_size + envs.VLLM_LLMDD_RPC_PORT if dp_rank_local is not None else tp_size + envs.VLLM_LLMDD_RPC_PORT + + self._reqs_need_recv: dict[str, tuple[Request, list[int]]] = {} + + def get_num_new_matched_tokens( + self, request: "Request", + num_computed_tokens: int) -> tuple[int, bool]: + """ + For remote prefill, pull all prompt blocks from remote + asynchronously relative to engine execution. + + Args: + request (Request): the request object. + num_computed_tokens (int): the number of locally + computed tokens for this request + Returns: + * the number of tokens that can be loaded from the + external KV cache beyond what is already computed. + * true if the external KV cache tokens will be loaded + asynchronously (between scheduler steps). + """ + + params = request.kv_transfer_params + logger.debug( + f"LLMDataDistCMgrConnector get_num_new_matched_tokens: num_computed_tokens={num_computed_tokens}, kv_transfer_params={params}" + ) + + if params is not None and params.get("do_remote_prefill"): + # Remote prefill: get all prompt blocks from remote. + assert num_computed_tokens % self.block_size == 0 + # Note: We use the full token count as transmit data here. + count = max(len(request.prompt_token_ids) - num_computed_tokens, 0) + return count, count > 0 + + # No remote prefill for this request. + return 0, False + + def update_state_after_alloc(self, request: Request, blocks: KVCacheBlocks, + num_externel_tokens: int): + params = request.kv_transfer_params + logger.debug( + f"LLMDataDistCMgrConnector update states num_externel_tokens: {num_externel_tokens} kv_transfer_params: {params}" + ) + if params is not None and params.get("do_remote_prefill"): + if params.get("remote_block_ids"): + if all(p in params for p in ("remote_engine_id", "remote_host", + "remote_port", "remote_tp_size")): + self._reqs_need_recv[request.request_id] = ( + request, blocks.get_unhashed_block_ids()) + else: + logger.warning("" \ + f"Invalid KVTransferParams {params}, This request will be discard") + else: + assert num_externel_tokens == 0 + params["do_remote_prefill"] = False + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput, + ) -> KVConnectorMetadata: + meta = LLMDataDistCMgrConnectorMetadata() + + for req_id, (req, block_ids) in self._reqs_need_recv.items(): + assert req.kv_transfer_params is not None + meta.add_new_req(request_id=req_id, + local_block_ids=block_ids, + kv_transfer_params=req.kv_transfer_params) + self._reqs_need_recv.clear() + + return meta + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + + params = request.kv_transfer_params + logger.debug( + "LLMDataDistCMgrConnector request_finished, request_status=%s, " + "kv_transfer_params=%s", request.status, params) + + if (params is None or not params.get("do_remote_decode") + or request.status != RequestStatus.FINISHED_LENGTH_CAPPED): + return False, None + + # note: NIXL transfer the full block only, but I don't see any reason to do that, so here + # we just transfer any data that computed from prefill node + # note: there might be some issue on this, check it if there is any unexpected result + computed_block_ids = block_ids + # If prompt < block_size, no xfer so free blocks immediately. 
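        # NOTE (editorial sketch, not part of the patch): the dict returned
        # below is the kv_transfer_params contract that the decode-side
        # scheduler consumes in update_state_after_alloc / add_new_req above.
        # Roughly:
        #   {
        #       "do_remote_prefill": True,     # decode should pull KV remotely
        #       "remote_block_ids": [...],     # blocks computed on this node
        #       "remote_engine_id": <engine id of this prefill instance>,
        #       "remote_host": <ip of this prefill node>,
        #       "remote_port": <base rpc port; worker adds its tp offset>,
        #       "remote_tp_size": <prefill tensor parallel size, as a string>,
        #   }
        # If any of these keys is missing, the decode side logs a warning and
        # drops the transfer for that request.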
+ + return False, dict( + do_remote_prefill=True, + do_remote_decode=False, + remote_block_ids=computed_block_ids, + remote_engine_id=self.engine_id, + remote_host=self.local_ip, + remote_port=self.port, + remote_tp_size=str( + self.vllm_config.parallel_config.tensor_parallel_size), + ) + + +class LLMDataDistCMgrConnectorWorker(): + """ + Implementation of Worker side methods + """ + + def __init__(self, vllm_config: VllmConfig): + assert vllm_config.kv_transfer_config is not None + logger.info("Initialize the LLMDataDistCMgrConnectorWorker") + # we assume the local node only contains dp and tp, and tp will not communicate inter-node. + # for any scenario beyond this scope, the functionality of this connector is not guaranteed. + self.local_rank_on_node = get_world_group().rank % ( + vllm_config.parallel_config.data_parallel_size_local * + vllm_config.parallel_config.tensor_parallel_size) + self.local_rank = get_world_group().local_rank + self.local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local + self.tp_size = vllm_config.parallel_config.tensor_parallel_size + self.tp_rank = get_tp_group().rank_in_group + self.rank = get_world_group().rank + self.local_ip = get_ip() + self.kv_transfer_config: KVTransferConfig = vllm_config.kv_transfer_config + self.local_agent_metadata: Optional[ + LLMDataDistCMgrAgentMetadata] = None + self.vllm_config = vllm_config + + self.llm_datadist_role = None + self.llm_datadist_remote_role = None + if self.kv_transfer_config.kv_role == "kv_producer": + self.llm_datadist_role = LLMRole.PROMPT + self.llm_datadist_remote_role = LLMRole.DECODER + elif self.kv_transfer_config.kv_role == "kv_consumer": + self.llm_datadist_role = LLMRole.DECODER + self.llm_datadist_remote_role = LLMRole.PROMPT + else: + raise RuntimeError( + f"LLMDataDistWorker: Receive unexpected kv role in LLMDataDistWorker, this worker now only support kv_producer and kv_consumer, but receiving {vllm_config.kv_transfer_config.kv_role}" + ) + + # linked_cluster record the cluster that already build the connection its format should be {"cluster_id": "comm_name"} + self.linked_cluster: dict[Any, Any] = {} + self.prefill_device_list: list[tuple[int, int]] = [] + self.decode_device_list: list[tuple[int, int]] = [] + global_rank_table = self.read_offline_rank_table() + self.local_agent_metadata = self.read_agent_metadata( + global_rank_table, self.local_ip, self.local_rank_on_node, + self.llm_datadist_role) + self.llm_datadist = LLMDataDist(self.llm_datadist_role, + self.local_agent_metadata.cluster_id) + self.init_llm_datadist() + self.finished_reqs: set[str] = set() + self.soc_info = NPUSocInfo() + + def listen_for_agent_metadata_req(self, event: threading.Event): + assert self.local_agent_metadata is not None + port = envs.VLLM_LLMDD_RPC_PORT + self.local_dp_rank * self.tp_size + self.tp_rank if self.local_dp_rank is not None else envs.VLLM_LLMDD_RPC_PORT + self.tp_size + self.tp_rank + url = f"tcp://0.0.0.0:{port}" + msg_encoder = msgspec.msgpack.Encoder() + msg_decoder = msgspec.msgpack.Decoder() + msg_to_send = msg_encoder.encode(self.local_agent_metadata) + logger.debug(f"Start to listen to address: {url}") + logger.debug( + f"The local agent metadata have {len(msg_to_send)} bytes here") + logger.info( + f"LLMDataDistCMgrConnectorWorker: Cluster {self.local_agent_metadata.cluster_id} start to listen request from peers" + ) + with zmq_ctx(zmq.ROUTER, url) as sock: # type: ignore[attr-defined] + event.set() + while True: + identity, _, msg = sock.recv_multipart() + decode_msg = 
msg_decoder.decode(msg) + if "cluster_id" in decode_msg: + decode_msg = LLMDataDistCMgrAgentMetadata(**decode_msg) + logger.info( + f"LLMDataDistCMgrConnectorWorker: Receive message from cluster {decode_msg.cluster_id}" + ) + sock.send_multipart((identity, b"", msg_to_send)) + self.add_remote_agent(decode_msg) + else: + logger.warning( + f"LLMDataDistCMgrConnectorWorker: receiving unrecognized data {decode_msg}" + ) + + def init_llm_datadist(self): + assert self.local_agent_metadata is not None + llm_config = LLMConfig() + llm_config.device_id = self.local_rank + llm_config.sync_kv_timeout = 20000 + llm_config.enable_switch_role = True + llm_config.enable_cache_manager = True + llm_config.enable_remote_cache_accessible = True + llm_config_options = llm_config.generate_options() + self.llm_datadist.init(llm_config_options) + self.cache_manager = self.llm_datadist.cache_manager + logger.info( + f"Done initialize llm_datadist in rank {self.rank}, local rank {self.local_rank}, cluster id {self.local_agent_metadata.cluster_id}" + ) + + def read_offline_rank_table(self): + assert ( + envs.DISAGGREGATED_PREFILL_RANK_TABLE_PATH + ), "Please set path of rank_table to env variable DISAGGREGATED_PREFILL_RANK_TABLE_PATH" + rank_table_path = envs.DISAGGREGATED_PREFILL_RANK_TABLE_PATH + with open(rank_table_path, "r", encoding="utf-8") as f: + global_rank_table = json.load(f) + decode_device_list = global_rank_table["decode_device_list"] + for decode_device in decode_device_list: + server_id = decode_device["server_id"] + device_id = decode_device["device_id"] + self.decode_device_list.append((server_id, device_id)) + prefill_device_list = global_rank_table["prefill_device_list"] + for prefill_device in prefill_device_list: + server_id = prefill_device["server_id"] + device_id = prefill_device["device_id"] + self.prefill_device_list.append((server_id, device_id)) + + # global_rank_table = json.dumps(global_rank_table) + return global_rank_table + + def read_agent_metadata(self, global_rank_table, server_id, device_rank, + agent_role): + devices_type_list = [] + agent_metadata = None + if self.llm_datadist_role == LLMRole.PROMPT: + devices_type_list.append("prefill_device_list") + elif self.llm_datadist_role == LLMRole.DECODER: + devices_type_list.append("decode_device_list") + else: + devices_type_list.append("prefill_device_list") + devices_type_list.append("decode_device_list") + for device_type in devices_type_list: + device_list = global_rank_table[device_type] + device_list = [ + d for d in device_list if d.get("server_id") == server_id + ] + if len(device_list) <= device_rank: + continue + device_info = device_list[device_rank] + super_pod_id_ = device_info.get("super_pod_id", None) + server_id_ = device_info["server_id"] + device_id_ = device_info["device_id"] + device_ip_ = device_info["device_ip"] + super_device_id_ = device_info.get("super_device_id", None) + cluster_id_ = int(device_info["cluster_id"]) + agent_metadata = LLMDataDistCMgrAgentMetadata( + super_pod_id=super_pod_id_, + server_id=server_id_, + device_id=device_id_, + device_ip=device_ip_, + super_device_id=super_device_id_, + cluster_id=cluster_id_, + ) + assert agent_metadata is not None, f"Can't read the target server_id {server_id} and device_rank {device_rank} from rank table" + return agent_metadata + + def register_kv_caches(self, kv_caches: dict[str, Tuple[torch.Tensor]]): + _, first_kv_cache_tuple = next(iter(kv_caches.items())) + first_kv_cache = first_kv_cache_tuple[0] + assert len(first_kv_cache_tuple) > 1 + assert 
self.local_agent_metadata is not None + kv_cache_dtype = first_kv_cache.dtype + self.use_mla: bool = first_kv_cache_tuple[0].size( + -1) != first_kv_cache_tuple[1].size(-1) + # MLA case. [2 (k_normed, k_pe), num_blocks, ...] + # MHA case. [2 (k and v), num_blocks, ...] + self.num_blocks = first_kv_cache.shape[0] + block_rank = 3 # [block_size, latent_dim] + block_shape = first_kv_cache.shape[-block_rank:] + + self.block_len = math.prod(block_shape) + self.cache_addr: list[int] = [] + alignment = 2 * 1024 * 1024 + if self.use_mla: + cache_k_normed_addr_list = [] + cache_k_pe_addr_list = [] + k_normed = None + k_pe = None + for cache_or_caches in kv_caches.values(): + assert len(cache_or_caches) > 1 + k_normed, k_pe = cache_or_caches[0], cache_or_caches[1] + cache_k_normed_addr_list.append(k_normed.data_ptr()) + cache_k_pe_addr_list.append(k_pe.data_ptr()) + self.cache_addr = (cache_k_normed_addr_list, cache_k_pe_addr_list) + + cache_desc_k_normed = CacheDesc( + len(self.cache_addr[0]), [*k_normed.shape], + TORCH_DTYPE_TO_NPU_DTYPE[kv_cache_dtype]) + cache_desc_k_pe = CacheDesc( + len(self.cache_addr[1]), [*k_pe.shape], + TORCH_DTYPE_TO_NPU_DTYPE[kv_cache_dtype]) + cache_key_k_normed = BlocksCacheKey(cluster_id=int( + self.local_agent_metadata.cluster_id), + model_id=0) + cache_key_k_pe = BlocksCacheKey(cluster_id=int( + self.local_agent_metadata.cluster_id), + model_id=1) + self.cache_desc = (cache_desc_k_normed, cache_desc_k_pe) + self.cache_key = (cache_key_k_normed, cache_key_k_pe) + try: + cache_k_normed = self.cache_manager.register_blocks_cache( + self.cache_desc[0], self.cache_addr[0], self.cache_key[0]) + cache_k_pe = self.cache_manager.register_blocks_cache( + self.cache_desc[1], self.cache_addr[1], self.cache_key[1]) + self.cache = (cache_k_normed, cache_k_pe) + logger.info("LLMDataDistWorker: End of register Paged Cache.") + except (TypeError, ValueError): + raise RuntimeError( + f"LLMDataDistCMgrConnectorWorker: Passing unexpected parameter to register_block_cache, receiving [cache_desc: {self.cache_desc}, cache_addr: {self.cache_addr}, cache_key: {self.cache_key}]" + ) + else: + for cache_or_caches in kv_caches.values(): + for cache in cache_or_caches: + base_addr = cache.data_ptr() + assert base_addr % alignment == 0, "The address of the registered kv cache should be aligned to 2M" + self.cache_addr.append(base_addr) + # register paged kv cache into the llm_cache manager + self.cache_desc = CacheDesc( + len(self.cache_addr), [*cache.shape], + TORCH_DTYPE_TO_NPU_DTYPE[kv_cache_dtype]) + self.cache_key = BlocksCacheKey( + cluster_id=int(self.local_agent_metadata.cluster_id)) + logger.info( + f"num of cache: {len(self.cache_addr)}, size of cache: {[*cache.shape]}, real size of cache: {first_kv_cache.shape}" + ) + try: + self.cache = self.cache_manager.register_blocks_cache( + self.cache_desc, self.cache_addr, self.cache_key) + logger.info( + "LLMDataDistCMgrConnectorWorker: End of register Paged Cache." 
+ ) + except (TypeError, ValueError): + raise RuntimeError( + f"LLMDataDistCMgrConnectorWorker: Passing unexpected parameter to register_block_cache, receiving [cache_desc: {self.cache_desc}, cache_addr: {self.cache_addr}, cache_key: {self.cache_key}]" + ) + self.ready_event = threading.Event() + self.metadata_agent_listener_t = threading.Thread( + target=self.listen_for_agent_metadata_req, + args=(self.ready_event, ), + daemon=True, + name="metadata_agent_listener") + self.metadata_agent_listener_t.start() + self.ready_event.wait() + + def start_load_kv(self, metadata: LLMDataDistCMgrConnectorMetadata): + for req_id, meta in metadata.requests.items(): + logger.debug(f"Start to transmit {req_id}") + self._read_blocks(meta.local_block_ids, + meta.remote_block_ids, meta.remote_host, + int(meta.remote_port), meta.engine_id, req_id, + meta.remote_tp_size) + self.finished_reqs.add(req_id) + + def add_remote_agent(self, metadata: LLMDataDistCMgrAgentMetadata) -> int: + assert self.local_agent_metadata is not None + remote_cluster_id = metadata.cluster_id + if remote_cluster_id in self.linked_cluster: + logger.debug( + f"LLMDataDistCMgrConnectorWorker: remote cluster_id: {metadata.cluster_id} already linked with this server, skip the connection" + ) + return remote_cluster_id + remote_super_pod_id = metadata.super_pod_id + remote_server_id = metadata.server_id + is_same_server = remote_server_id == self.local_agent_metadata.server_id + is_same_pod = remote_super_pod_id == self.local_agent_metadata.super_pod_id + if self.llm_datadist_role == LLMRole.PROMPT: + prefill_metadata = self.local_agent_metadata + decode_metadata = metadata + else: + prefill_metadata = metadata + decode_metadata = self.local_agent_metadata + comm_name = f"pd_comm_{prefill_metadata.device_ip}_{decode_metadata.device_ip}" + cluster_rank_info = { + prefill_metadata.cluster_id: 0, + decode_metadata.cluster_id: 1 + } + rank_table = {} + rank_table["version"] = "1.2" + rank_table["server_count"] = "1" if is_same_server else "2" + rank_table["status"] = "completed" + + # generate server_list for rank table + rank_table["server_list"] = [] # type: ignore[assignment] + decode_server_device_info = None + prefill_server_device_info = { + "device": [{ + k: v + for k, v in [( + "device_id", prefill_metadata.device_id + ), ("device_ip", prefill_metadata.device_ip + ), ("super_device_id", + prefill_metadata.super_device_id), ("rank_id", "0")] + if v is not None + }], + "server_id": + prefill_metadata.server_id + } + if is_same_server: + prefill_server_device_info["device"].append( # type: ignore[attr-defined] + { + k: v + for k, v in [( + "device_id", decode_metadata.device_id + ), ("device_ip", decode_metadata.device_ip + ), ("super_device_id", + decode_metadata.super_device_id), ("rank_id", "1")] + if v is not None + }) + else: + decode_server_device_info = { + "device": [{ + k: v + for k, v in [( + "device_id", decode_metadata.device_id + ), ("device_ip", decode_metadata.device_ip + ), ("super_device_id", + decode_metadata.super_device_id), ("rank_id", "1")] + if v is not None + }], + "server_id": + decode_metadata.server_id + } + rank_table["server_list"].append( # type: ignore[attr-defined] + prefill_server_device_info) + if decode_server_device_info is not None: + rank_table["server_list"].append( # type: ignore[attr-defined] + decode_server_device_info) + + if self.soc_info.is_a3: + # generate super_pod_list for rank table + super_pod_list = [] + prefill_super_pod_info = { + "super_pod_id": prefill_metadata.super_pod_id, + 
"server_list": [{ + "server_id": prefill_metadata.server_id + }], + } + if is_same_pod and not is_same_server: + prefill_super_pod_info[ + "server_list"].append( # type: ignore[attr-defined] + {"server_id": decode_metadata.server_id}) + super_pod_list.append(prefill_super_pod_info) + if not is_same_pod: + decode_super_pod_id = { + "super_pod_id": decode_metadata.super_pod_id, + "server_list": [{ + "server_id": decode_metadata.server_id + }], + } + super_pod_list.append(decode_super_pod_id) + rank_table[ + "super_pod_list"] = super_pod_list # type: ignore[assignment] + logger.info( + f"LLMDataDistCMgrConnectorWorker: try link with remote, comm id: {comm_name}" + ) + logger.info(f"rank table \n{rank_table}") + logger.info(f"comm name: {comm_name}") + logger.info(f"cluster rank info: {cluster_rank_info}") + comm_id = self.llm_datadist.link(comm_name, cluster_rank_info, + json.dumps(rank_table)) + while True: + ret = self.llm_datadist.query_register_mem_status(comm_id=comm_id) + if ret == llm_datadist.RegisterMemStatus.OK: + logger.info( + f"LLMDataDistCMgrConnectorWorker: Linking success, comm id: {comm_id}" + ) + break + elif ret == llm_datadist.RegisterMemStatus.FAILED: + raise RuntimeError( + f"LLMDataDistCMgrConnectorWorker: Linking failed, comm id: {comm_id}" + ) + time.sleep(1) + logger.info("Checking query_register_mem_status again") + self.linked_cluster.update({remote_cluster_id: comm_id}) + logger.info(f"cached linked cluster: {self.linked_cluster}") + logger.info( + f"Successfully build link with cluster id {remote_cluster_id} with cluster name {comm_name} !" + ) + return remote_cluster_id + + def remove_remote_agent(self, cluster_id: int): + if cluster_id not in self.linked_cluster: + logger.warning( + f"LLMDataDistCMgrConnectorWorker: Warning! Can't remove remote client with cluster id {cluster_id} for its not exist in linked_cluster list" + ) + comm_id = self.linked_cluster[cluster_id] + try: + self.llm_datadist.unlink(comm_id) + self.linked_cluster.pop(cluster_id) + except LLMException: + logger.error( + f"Try to remove remote client with cluster id {cluster_id} failed!, program won't terminate, but please carefully check your environment" + ) + logger.info( + f"Successfully remove remote client with cluster id {cluster_id} !" 
+ ) + + def connect_to_remote_agent(self, host: str, port: int) -> int: + url = f"tcp://{host}:{port}" + logger.debug(f"Querying metadata from url: {url}") + msg_encoder = msgspec.msgpack.Encoder() + msg_send = msg_encoder.encode(self.local_agent_metadata) + with zmq_ctx(zmq.REQ, url) as sock: # type: ignore[attr-defined] + logger.info("Try request remote metadata from socket......") + sock.send(msg_send) + metadata_bytes = sock.recv() + decoder = msgspec.msgpack.Decoder() + metadata = decoder.decode(metadata_bytes) + metadata = LLMDataDistCMgrAgentMetadata(**metadata) + logger.info(f"recving metadata: {metadata}") + cluster_id = self.add_remote_agent(metadata) + return cluster_id + + def _read_blocks( + self, + local_block_ids: list[int], + remote_block_ids: list[int], + remote_ip: str, + remote_port: int, + remote_engine_id: str, + request_id: str, + remote_tp_size: str, + ): + # if remote_ip not in self.linked_cluster: + tp_offset = self.tp_rank % int(remote_tp_size) + remote_cluster_id = self.connect_to_remote_agent( + remote_ip, remote_port + tp_offset) + num_local_blocks = len(local_block_ids) + if num_local_blocks == 0: + return + num_remote_blocks = len(remote_block_ids) + assert num_local_blocks <= num_remote_blocks + if num_local_blocks < num_remote_blocks: + remote_block_ids = remote_block_ids[-num_local_blocks:] + + logger.info(f"remote cluster id is: {remote_cluster_id}") + if self.use_mla: + remote_cache_key_k_normed = BlocksCacheKey( + cluster_id=remote_cluster_id, model_id=0) + remote_cache_key_k_pe = BlocksCacheKey( + cluster_id=remote_cluster_id, model_id=1) + logger.info("Try pull blocks from remote server") + try: + self.cache_manager.pull_blocks( + remote_cache_key_k_normed, + self.cache[0], # type: ignore[has-type] + remote_block_ids, + local_block_ids) + self.cache_manager.pull_blocks( + remote_cache_key_k_pe, + self.cache[1], # type: ignore[has-type] + remote_block_ids, + local_block_ids) + except (TypeError, ValueError): + raise RuntimeError( + f"LLMDataDistCMgrConnectorWorker: Passing unexpected parameter to pull_blocks remote_cache_key: {remote_cache_key_k_normed} {remote_cache_key_k_pe}, cache: {self.cache}, local_block_ids: {local_block_ids}, remote_block_ids: {remote_block_ids}" # type: ignore[has-type] + ) + except LLMException: + raise RuntimeError( + "LLMDataDistCMgrConnectorWorker: Timeout during pull_blocks, you can try to increase the sync_kv_timeout config or checking your connect status" + ) + else: + remote_cache_key = BlocksCacheKey(cluster_id=remote_cluster_id) + logger.info("Try pull blocks from remote server") + try: + self.cache_manager.pull_blocks( + remote_cache_key, + self.cache, # type: ignore[has-type] + remote_block_ids, + local_block_ids) + except (TypeError, ValueError): + raise RuntimeError( + f"LLMDataDistCMgrConnectorWorker: Passing unexpected parameter to pull_blocks remote_cache_key: {remote_cache_key}, cache: {self.cache}, local_block_ids: {local_block_ids}, remote_block_ids: {remote_block_ids}" # type: ignore[has-type] + ) + except LLMException: + raise RuntimeError( + "LLMDataDistCMgrConnectorWorker: Timeout during pull_blocks, you can try to increase the sync_kv_timeout config or checking your connect status" + ) + + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[Optional[set[str]], Optional[set[str]]]: + """Get the finished recving and sending requuests.""" + import copy + req_ids_to_ret = copy.deepcopy(self.finished_reqs) + self.finished_reqs.clear() + if self.llm_datadist_role == LLMRole.PROMPT: + return 
req_ids_to_ret, None + else: + return None, req_ids_to_ret + + +# adopt this from https://github.com/vllm-project/vllm/blob/main/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +@contextlib.contextmanager +def zmq_ctx(socket_type: Any, + addr: str) -> Iterator[zmq.Socket]: # type: ignore[name-defined] + """Context manager for a ZMQ socket""" + + ctx: Optional[zmq.Context] = None # type: ignore[name-defined] + try: + ctx = zmq.Context() # type: ignore[attr-defined] + + if socket_type == zmq.ROUTER: # type: ignore[attr-defined] + socket = ctx.socket(zmq.ROUTER) # type: ignore[attr-defined] + socket.bind(addr) + elif socket_type == zmq.REQ: # type: ignore[attr-defined] + socket = ctx.socket(zmq.REQ) # type: ignore[attr-defined] + socket.connect(addr) + else: + raise ValueError(f"Unexpected socket type: {socket_type}") + + yield socket + finally: + if ctx is not None: + ctx.destroy(linger=0) diff --git a/vllm_ascend/distributed/parallel_state.py b/vllm_ascend/distributed/parallel_state.py deleted file mode 100644 index 2778a6ef27..0000000000 --- a/vllm_ascend/distributed/parallel_state.py +++ /dev/null @@ -1,77 +0,0 @@ -from typing import Optional - -import torch -from vllm.distributed.parallel_state import (GroupCoordinator, get_world_group, - init_model_parallel_group) - -# vllm-ascend will maintain its own EP GroupCoordinator and ETP GroupCoordinator for -# customize parallel solution -_EP: Optional[GroupCoordinator] = None -_ETP: Optional[GroupCoordinator] = None - - -def get_ep_group() -> GroupCoordinator: - assert _EP is not None, ("expert model parallel group is not initialized") - return _EP - - -def get_etp_group() -> GroupCoordinator: - assert _ETP is not None, ( - "expert tensor parallel group is not initialized") - return _ETP - - -def model_parallel_initialized(): - return (_ETP is not None and _EP is not None) - - -def init_ascend_model_parallel( - expert_parallel_size: int = 1, - expert_tensor_parallel_size: int = 1, - world_size: Optional[int] = None, - backend: Optional[str] = None, -): - if model_parallel_initialized(): - return - assert torch.distributed.is_initialized() - world_size = world_size or torch.distributed.get_world_size() - backend = backend or torch.distributed.get_backend( - get_world_group().device_group) - num_expert_parallel_groups = expert_tensor_parallel_size - num_expert_tensor_parallel_groups = expert_parallel_size - - global _EP - group_ranks = [] - for i in range(num_expert_parallel_groups): - ranks = list(range(i, world_size, num_expert_parallel_groups)) - group_ranks.append(ranks) - - _EP = init_model_parallel_group(group_ranks, - get_world_group().local_rank, - backend, - group_name="ep") - - group_ranks = [] - global _ETP - for i in range(num_expert_tensor_parallel_groups): - ranks = list( - range(i * expert_tensor_parallel_size, - (i + 1) * expert_tensor_parallel_size)) - group_ranks.append(ranks) - - _ETP = init_model_parallel_group(group_ranks, - get_world_group().local_rank, - backend, - group_name="etp") - - -def destory_ascend_model_parallel(): - global _EP - if _EP: - _EP.destroy() - _EP = None - - global _ETP - if _ETP: - _ETP.destroy() - _ETP = None diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py index 02ecd6625b..27d0131720 100644 --- a/vllm_ascend/envs.py +++ b/vllm_ascend/envs.py @@ -116,6 +116,27 @@ # value to False to disable the optimized model. "USE_OPTIMIZED_MODEL": lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))), + # `LLMDataDistCMgrConnector` required variable. 
`DISAGGREGATED_PREFILL_RANK_TABLE_PATH` is + # used for llmdatadist to build the communication topology for kv cache transfer, it is + # a required variable if `LLMDataDistCMgrConnector` is used as kv connector for disaggregated + # pd. The rank table can be generated by adopting the script `gen_ranktable.sh` + # in vllm_ascend's example folder. + "DISAGGREGATED_PREFILL_RANK_TABLE_PATH": + lambda: os.getenv("DISAGGREGATED_PREFILL_RANK_TABLE_PATH", None), + # `LLMDataDistCMgrConnector` required variable. `VLLM_ASCEND_LLMDD_RPC_IP` is used as the + # rpc communication listening ip, which will be used to receive the agent metadata from the + # remote worker. + "VLLM_ASCEND_LLMDD_RPC_IP": + lambda: os.getenv("VLLM_ASCEND_LLMDD_RPC_IP", "0.0.0.0"), + # `LLMDataDistCMgrConnector` required variable. `VLLM_LLMDD_RPC_PORT` is used as the + # rpc communication listening port, which will be used to receive the agent metadata from the + # remote worker. + "VLLM_LLMDD_RPC_PORT": + lambda: int(os.getenv("VLLM_LLMDD_RPC_PORT", 5557)), + # Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible + # and the mla_pa will be the default path of deepseek decode path. + "VLLM_ASCEND_MLA_PA": + lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", 0)) } # end-env-vars-definition diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py index 490cd4ed5e..d85572b32c 100644 --- a/vllm_ascend/models/__init__.py +++ b/vllm_ascend/models/__init__.py @@ -35,14 +35,19 @@ def register_model(): ModelRegistry.register_model( "DeepseekV2ForCausalLM", "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM") + + ModelRegistry.register_model( + "DeepseekV3ForCausalLM", + "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM") + else: ModelRegistry.register_model( "DeepseekV2ForCausalLM", "vllm_ascend.models.deepseek_v2:CustomDeepseekV2ForCausalLM") - ModelRegistry.register_model( - "DeepseekV3ForCausalLM", - "vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM") + ModelRegistry.register_model( + "DeepseekV3ForCausalLM", + "vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM") ModelRegistry.register_model( "Qwen3MoeForCausalLM", diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py index 6ab0837e37..000bd39ed5 100644 --- a/vllm_ascend/models/deepseek_dbo.py +++ b/vllm_ascend/models/deepseek_dbo.py @@ -25,38 +25,33 @@ # # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py # """Inference-only DeepseekV2/DeepseekV3 model.""" -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional, Union import torch import torch.distributed as dist import torch_npu # noqa: F401 -import vllm.envs as envs from torch import nn from transformers import PretrainedConfig -from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, ModelConfig, VllmConfig -from vllm.distributed import (get_pp_group, +from vllm.attention import AttentionMetadata +from vllm.config import (CacheConfig, ModelConfig, VllmConfig, + get_current_vllm_config) +from vllm.distributed import (get_ep_group, get_pp_group, + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group, tensor_model_parallel_all_reduce) from vllm.distributed.parallel_state import get_dp_group from vllm.forward_context import get_forward_context from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - 
ReplicatedLinear, - RowParallelLinear) +from vllm.model_executor.layers.linear import (ReplicatedLinear, + UnquantizedLinearMethod) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.models.deepseek_v2 import \ DeepseekV2ForCausalLM # noqa: E501 -from vllm.model_executor.models.deepseek_v2 import \ - yarn_get_mscale # noqa: E501 -from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Attention, - DeepseekV2DecoderLayer, - DeepseekV2MLAAttention) +from vllm.model_executor.models.deepseek_v2 import DeepseekV2DecoderLayer from vllm.model_executor.models.utils import ( PPMissingLayer, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -64,7 +59,9 @@ import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2MLP +from vllm_ascend.ascend_forward_context import FusedMoEState +from vllm_ascend.models.deepseek_v2 import (CustomDeepseekV2MLAAttention, + CustomDeepseekV2MLP) from vllm_ascend.multistream.base import MSEventKey from vllm_ascend.multistream.context import ( advance_step_multistream_layer_context, get_multistream_comm_context, @@ -74,8 +71,9 @@ from vllm_ascend.multistream.metadata import (MultiStreamConfig, MultiStreamStepMetadata, make_multistream_metadata_ds) -from vllm_ascend.multistream.ms_split import compute_split_seq_index from vllm_ascend.ops.fused_moe import AscendFusedMoE +from vllm_ascend.quantization.w8a8_dynamic import ( + AscendW8A8DynamicLinearMethod, apply_mlp) from vllm_ascend.utils import dispose_tensor VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO @@ -83,16 +81,48 @@ class CustomDeepseekDBOMLP(CustomDeepseekV2MLP): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, + prefix: str = "", + ) -> None: + super().__init__(hidden_size=hidden_size, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + quant_config=quant_config, + prefix=prefix) + self.is_dynamic_quant = not isinstance( + self.gate_up_proj.quant_method, + UnquantizedLinearMethod) and isinstance( + self.gate_up_proj.quant_method.quant_method, + AscendW8A8DynamicLinearMethod) + def _forward_ms_mlp(self, x): current_ms_metadata = get_multistream_comm_context() assert current_ms_metadata is not None gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - current_ms_metadata.before_comm_event.record() - with torch.npu.stream(current_ms_metadata.comm_stream): - current_ms_metadata.before_comm_event.wait() + if self.is_dynamic_quant: + x, dynamic_scale = self.act_fn(gate_up) + x = torch_npu.npu_quant_matmul( + x, + self.down_proj.weight, + self.down_proj.weight_scale, + pertoken_scale=dynamic_scale, + output_dtype=torch.bfloat16, + ) + if self.down_proj.reduce_results and self.down_proj.tp_size > 1: + current_ms_metadata.before_comm_event.record() + with torch.npu.stream(current_ms_metadata.comm_stream): + current_ms_metadata.before_comm_event.wait() + x = tensor_model_parallel_all_reduce(x) + current_ms_metadata.after_comm_event.record() + else: + x = self.act_fn(gate_up) x, _ = self.down_proj(x) 
- current_ms_metadata.after_comm_event.record() return x @@ -163,7 +193,10 @@ def __init__( self.tp_group = get_tp_group().device_group self.tp_rank = get_tp_group().rank_in_group - + self.kv_consumer = None + transfer_config = get_current_vllm_config().kv_transfer_config + if transfer_config is not None: + self.kv_consumer = transfer_config.kv_role = "kv_consumer" self.params_dtype = torch.get_default_dtype() ascend_config = get_ascend_config() @@ -173,39 +206,34 @@ def forward( self, hidden_states: torch.Tensor, attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor: + forward_context = get_forward_context() if attn_metadata is None: - attn_metadata = get_forward_context().attn_metadata + attn_metadata = forward_context.attn_metadata + # when profile runs, force experts to load balanced tokens # to avoid high memory consumption on a single rank. - # TODO: need a better flag to indicate whether in profile run or not. - if attn_metadata is None: - # for profile run - is_prefill = True - enable_force_load_balance = True - else: - is_prefill = attn_metadata.num_prefills > 0 - enable_force_load_balance = False - if hasattr(attn_metadata, 'with_prefill_across_dp'): - is_prefill = is_prefill or attn_metadata.with_prefill_across_dp + enable_force_load_balance = forward_context.in_profile_run - old_hidden_states = hidden_states.clone() + is_prefill = forward_context.with_prefill + # If this node is kv_consumer, we force the moe always runs in decode path to make sure + # the behaviour aligned between dummy_run and normal model_execute. + if self.kv_consumer: + is_prefill = False # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - hidden_states = self.experts( + experts_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits, is_prefill=is_prefill, top_k=CustomDeepseekDBOMoE.top_k, enable_force_load_balance=enable_force_load_balance, - ) * self.routed_scaling_factor - - if self.n_shared_experts is not None: - shared_output = self.shared_experts(old_hidden_states) + shared_experts=self.shared_experts) - if shared_output is not None: - hidden_states = hidden_states + shared_output + hidden_states = ( + experts_hidden_states[0] * self.routed_scaling_factor + + experts_hidden_states[1]) return hidden_states @@ -225,199 +253,6 @@ def _forward_ms_op_gate( router_logits, _ = self.gate(hidden_states) return router_logits - def _forward_ms_op_tp_allgather( - self, - hidden_states: torch.Tensor, - chunk_hidden_states: torch.Tensor, - num_tokens: int = 0, - ): - current_ms_metadata = get_multistream_comm_context() - if current_ms_metadata is None: - dist.all_gather(list(chunk_hidden_states), hidden_states, - self.tp_group) - final_hidden_states = torch.cat(chunk_hidden_states, dim=0) - if num_tokens > 0: - final_hidden_states = final_hidden_states[:-num_tokens] - else: - current_ms_metadata.before_comm_event.record() - with torch.npu.stream(current_ms_metadata.comm_stream): - current_ms_metadata.before_comm_event.wait() - dist.all_gather(list(chunk_hidden_states), hidden_states, - self.tp_group) - final_hidden_states = torch.cat(chunk_hidden_states, dim=0) - if num_tokens > 0: - final_hidden_states = final_hidden_states[:-num_tokens] - current_ms_metadata.after_comm_event.record() - return final_hidden_states - - -class CustomDeepseekDBOMLAAttention(DeepseekV2MLAAttention): - - def __init__( - self, - config: PretrainedConfig, - hidden_size: int, - num_heads: int, - qk_nope_head_dim: int, - qk_rope_head_dim: int, - v_head_dim: int, 
- q_lora_rank: Optional[int], - kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - nn.Module.__init__(self) - self.hidden_size = hidden_size - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - self.v_head_dim = v_head_dim - - self.q_lora_rank = q_lora_rank - self.kv_lora_rank = kv_lora_rank - - self.num_heads = num_heads - tp_size = get_tensor_model_parallel_world_size() - assert num_heads % tp_size == 0 - self.num_local_heads = num_heads // tp_size - - self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - if self.q_lora_rank is not None: - self.q_a_proj = ReplicatedLinear(self.hidden_size, - self.q_lora_rank, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_a_proj") - self.q_a_layernorm = RMSNorm(self.q_lora_rank, - eps=config.rms_norm_eps) - self.q_b_proj = ColumnParallelLinear(q_lora_rank, - self.num_heads * - self.qk_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_b_proj") - else: - self.q_proj = ColumnParallelLinear(self.hidden_size, - self.num_heads * - self.qk_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_proj") - - self.kv_a_proj_with_mqa = ReplicatedLinear( - self.hidden_size, - self.kv_lora_rank + self.qk_rope_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.kv_a_proj_with_mqa") - self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, - eps=config.rms_norm_eps) - self.kv_b_proj = ColumnParallelLinear( - self.kv_lora_rank, - self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.kv_b_proj") - self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim, - self.hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj") - - if rope_scaling: - rope_scaling["rope_type"] = 'deepseek_yarn' - self.rotary_emb = get_rope(qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - is_neox_style=False) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] - mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) - self.scaling = self.scaling * mscale * mscale - - # In the MLA backend, kv_cache includes both k_c and - # pe (i.e. decoupled position embeddings). In particular, - # the concat_and_cache_mla op requires - # k_c.size(1) + k_pe.size(1) == kv_cache.size(2) - # i.e. 
- # kv_lora_rank + qk_rope_head_dim == head_size - self.mla_attn = Attention( - num_heads=self.num_local_heads, - head_size=self.kv_lora_rank + self.qk_rope_head_dim, - scale=self.scaling, - num_kv_heads=1, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - use_mla=True, - # MLA Args - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - qk_head_dim=self.qk_head_dim, - v_head_dim=self.v_head_dim, - rotary_emb=self.rotary_emb, - q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj, - kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, - kv_a_layernorm=self.kv_a_layernorm, - kv_b_proj=self.kv_b_proj, - o_proj=self.o_proj, - ) - - self.prefix = prefix - self.debug_layer_idx = int(self.prefix.split(".")[-2]) - - ascend_config = get_ascend_config() - self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: Optional[torch.Tensor] = None, - attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor: - if self.q_lora_rank is not None: - ckq = self.q_a_proj(hidden_states)[0] - hidden_states_or_q_c = self.q_a_layernorm(ckq) - else: - hidden_states_or_q_c = hidden_states - if self.torchair_graph_enabled: - forward_kwargs = {} - if envs.VLLM_USE_V1: - output_shape = hidden_states.shape - output = torch.empty(output_shape, - dtype=hidden_states_or_q_c.dtype, - device=hidden_states_or_q_c.device) - forward_kwargs['output'] = output - - output = self.mla_attn.impl.forward(self.mla_attn, - hidden_states_or_q_c, - hidden_states, None, kv_cache, - attn_metadata, - **forward_kwargs) - if envs.VLLM_USE_V1: - output = output.view(-1, output_shape[-1]) - return output - else: - kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) - return self.mla_attn(hidden_states_or_q_c, - kv_c_normed, - k_pe, - output_shape=hidden_states.shape) - class CustomDeepseekDBODecoderLayer(DeepseekV2DecoderLayer): @@ -440,10 +275,7 @@ def __init__( layer_idx = int(prefix.split(sep='.')[-1]) self.layer_idx = layer_idx # TODO: enable mla in vllm-ascend - if model_config.use_mla: - attn_cls = CustomDeepseekDBOMLAAttention - else: - attn_cls = DeepseekV2Attention + attn_cls = CustomDeepseekV2MLAAttention self.self_attn = attn_cls( config=config, hidden_size=self.hidden_size, @@ -461,6 +293,10 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.self_attn", ) + self.tp_size = get_tensor_model_parallel_world_size() + self.dp_size = get_dp_group().world_size + self.tp_group = get_tp_group().device_group + self.global_num_experts = config.n_routed_experts if (config.n_routed_experts is not None and layer_idx >= config.first_k_dense_replace @@ -566,7 +402,26 @@ def _forward_ms_layer( shared_outputs = [] router_logits = [] chunk_hidden_states = [] - + chunk_router_logits = [] + topk_weights = [] + topk_ids = [] + num_moe_tokens = [] + original_shapes = [] + expanded_row_idx = [] + scatter_size_list = [] + gather_size_list = [] + local_expert_idx = [] + scatter_sizes = [] + expanded_expert_idx = [] + sorted_local_expert_idx = [] + sorted_idx = [] + + global_num_experts = len( + self.mlp.experts.expert_map + ) if self.mlp.experts.expert_map is not None else self.global_num_experts + ep_group = get_ep_group() + local_num_experts = global_num_experts // 
ep_group.world_size + fused_moe_state = get_forward_context().fused_moe_state # block 1 : attention # block 2 : attn tp communication # the attn computation of microbatch 1 can be overlapped with the moe @@ -631,88 +486,221 @@ def _forward_ms_layer( # when profile runs, force experts to load balanced tokens # to avoid high memory consumption on a single rank. # TODO: need a better flag to indicate whether in profile run or not. - if attn_metadata[i] is None: - # for profile run - is_prefill = True - enable_force_load_balance = True - else: - is_prefill = attn_metadata[i].num_prefills > 0 - enable_force_load_balance = False - - if self.mlp.tp_size > 1: - num_token, _ = hidden_states[i].shape - padded_num_tokens = (self.mlp.tp_size - num_token % - self.mlp.tp_size) % self.mlp.tp_size - if padded_num_tokens > 0: - hidden_states[i] = nn.functional.pad( - hidden_states[i], (0, 0, 0, padded_num_tokens)) - chunk_hidden_state = torch.tensor_split(hidden_states[i], - self.mlp.tp_size, - dim=0) - chunk_hidden_states.append(chunk_hidden_state) - local_hidden_states = chunk_hidden_state[self.mlp.tp_rank] - else: - local_hidden_states = hidden_states[i] - - router_logit = self.mlp._forward_ms_op_gate(local_hidden_states) + router_logit = self.mlp._forward_ms_op_gate(hidden_states[i]) router_logits.append(router_logit) if CustomDeepseekDBOMoE.top_k: real_top_k = CustomDeepseekDBOMoE.top_k else: real_top_k = self.mlp.experts.top_k + if (self.tp_size > 1 + and fused_moe_state != FusedMoEState.AllGather): + if num_tokens[i] < self.tp_size: + hidden_states[i] = nn.functional.pad( + hidden_states[i], + (0, 0, 0, self.tp_size - num_tokens[i])) + router_logits[i] = nn.functional.pad( + router_logits[i], + (0, 0, 0, self.tp_size - num_tokens[i])) + chunk_hidden_state = torch.tensor_split(hidden_states[i], + self.tp_size, + dim=0) + chunk_hidden_states.append(chunk_hidden_state) + chunk_router_logit = torch.tensor_split(router_logits[i], + self.tp_size, + dim=0) + chunk_router_logits.append(chunk_router_logit) + tp_rank = get_tensor_model_parallel_rank() + hidden_states[i] = chunk_hidden_states[i][tp_rank] + router_logits[i] = chunk_router_logits[i][tp_rank] + + if self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather: + if attn_metadata[i] is not None: + max_num_tokens_across_dp = attn_metadata[ + i].max_tokens_across_dp + if num_tokens[i] < max_num_tokens_across_dp: + hidden_states[i] = nn.functional.pad( + hidden_states[i], + (0, 0, 0, + max_num_tokens_across_dp - num_tokens[i])) + router_logits[i] = nn.functional.pad( + router_logits[i], + (0, 0, 0, + max_num_tokens_across_dp - num_tokens[i])) + hidden_states[i] = get_dp_group().all_gather( + hidden_states[i], 0) + router_logits[i] = get_dp_group().all_gather( + router_logits[i], 0) + + # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern + if global_num_experts == 256: + topk_weight, topk_id, _ = torch_npu.npu_moe_gating_top_k( + router_logits[i], + k=real_top_k, # topk当前写8 + bias=self.mlp.experts.e_score_correction_bias, + k_group=self.mlp.experts.topk_group, # fix: 4 + group_count=self.mlp.experts.num_expert_group, # fix 8 + group_select_mode=1, # 0: group中的最大; 1: topk2.sum(fix) + renorm=0, # 0: softmax->topk(fix); 1: topk->softmax + norm_type=1, # 0: softmax; 1: sigmoid(fix) + # out_flag=False, # todo new api; 第三个输出是否输出 + # y2_flag=False, # old api; 第三个输出是否输出 + routed_scaling_factor=1, + eps=float(1e-20)) + else: + topk_weight, topk_id = self.mlp.experts.select_experts( + hidden_states=hidden_states[i], + 
router_logits=router_logits[i], + top_k=real_top_k, + use_grouped_topk=self.mlp.experts.use_grouped_topk, + renormalize=self.mlp.experts.renormalize, + topk_group=self.mlp.experts.topk_group, + num_expert_group=self.mlp.experts.num_expert_group, + custom_routing_function=self.mlp.experts. + custom_routing_function, + scoring_func=self.mlp.experts.scoring_func, + e_score_correction_bias=self.mlp.experts. + e_score_correction_bias, + ) + topk_weight = topk_weight.to(hidden_states[i].dtype) + topk_weights.append(topk_weight) + topk_ids.append(topk_id) + original_shape = hidden_states[i].shape + original_shapes.append(original_shape) + if len(original_shapes[i]) == 3: + hidden_states[i] = hidden_states[i].view( + -1, hidden_states[i].shape[-1]) + num_token, _ = hidden_states[i].shape + num_moe_tokens.append(num_token) + device = hidden_states[i].device + + row_idx_len = num_moe_tokens[i] * real_top_k + row_idx = (torch.arange(0, + row_idx_len, + dtype=torch.int32, + device=device).view(real_top_k, + -1).permute( + 1, 0).contiguous()) + hidden_states[ + i], expanded_row_idx_i, expanded_expert_idx_i = torch_npu.npu_moe_init_routing( + hidden_states[i], + row_idx=row_idx, + expert_idx=topk_ids[i], + active_num=num_moe_tokens[i]) + expanded_row_idx.append(expanded_row_idx_i) + expanded_expert_idx.append(expanded_expert_idx_i) - hidden_states[i] = self.mlp.experts._forward_ms_fused_moe_comp( - local_hidden_states, router_logits[i], is_prefill, real_top_k, - enable_force_load_balance) - - # the following kernels will be submitted to the comm stream to overlap the computation of the - # moe computation of next microbatch and the attn computation of next layer context = MultiStreamStepMetadata( comm_stream=ms_metadata.communicate_stream, before_comm_event=ms_metadata.ms_events[layer_index][i][ - MSEventKey.FFN_COM_FINISH], + MSEventKey.MOE_ALL_TO_ALL], after_comm_event=ms_metadata.ms_events[layer_index][i][ - MSEventKey.MOE_AFTER_COMM], + MSEventKey.MOE_ALL_TO_ALL_FINISH], ) context.before_comm_event.record() with torch.npu.stream(ms_metadata.communicate_stream): context.before_comm_event.wait() - if self.mlp.experts.reduce_results and ( - self.mlp.experts.tp_size > 1 - or self.mlp.experts.ep_size > 1): - hidden_states[i] = tensor_model_parallel_all_reduce( - hidden_states[i]) - hidden_states[ - i] = hidden_states[i] * self.mlp.routed_scaling_factor + global_expert_tokens = torch.bincount( + expanded_expert_idx[i], minlength=global_num_experts) + scatter_size = global_expert_tokens.view( + ep_group.world_size, -1).sum(-1) + scatter_sizes.append(scatter_size) + gather_sizes = torch.empty_like(scatter_sizes[i]) + dist.all_to_all_single(gather_sizes, + scatter_sizes[i], + group=ep_group.device_group) + scatter_size_list_i = scatter_sizes[i].cpu().tolist() + gather_size_list_i = gather_sizes.cpu().tolist() + scatter_size_list.append(scatter_size_list_i) + gather_size_list.append(gather_size_list_i) + expanded_expert_idx[ + i] = expanded_expert_idx[i] % local_num_experts + hidden_states[i] = ep_group.all_to_all(hidden_states[i], 0, 0, + scatter_size_list[i], + gather_size_list[i]) + local_expert_idx_i = ep_group.all_to_all( + expanded_expert_idx[i], 0, 0, scatter_size_list[i], + gather_size_list[i]) + local_expert_idx.append(local_expert_idx_i) + + sorted_local_expert_idx_i, sorted_idx_i = torch.sort( + local_expert_idx[i]) + sorted_local_expert_idx.append(sorted_local_expert_idx_i) + sorted_idx.append(sorted_idx_i) context.after_comm_event.record() + for i in range(num_micro_batchs): + 
ms_metadata.try_wait_event(layer_index, i, + MSEventKey.MOE_ALL_TO_ALL_FINISH) + expert_tokens = torch_npu.npu_moe_compute_expert_tokens( + sorted_local_expert_idx[i], local_num_experts).to(torch.int64) + group_list_type = 0 + hidden_states[i] = hidden_states[i][sorted_idx[i]] + hidden_states[i] = apply_mlp( + hidden_states[i], + self.mlp.experts.w13_weight, + self.mlp.experts.w13_weight_scale, #17 + self.mlp.experts.w2_weight, + self.mlp.experts.w2_weight_scale, + expert_tokens, #16 + group_list_type=group_list_type, + w1_scale_bias=None, + w2_scale_bias=None) + + resorted_idx = torch.argsort(sorted_idx[i]) + hidden_states[i] = hidden_states[i][resorted_idx] + hidden_states[i] = ep_group.all_to_all(hidden_states[i], 0, 0, + gather_size_list[i], + scatter_size_list[i]) + + hidden_states[i] = torch_npu.npu_moe_finalize_routing( + hidden_states[i], + skip1=None, + skip2=None, + bias=None, + scales=topk_weights[i], + expanded_src_to_dst_row=expanded_row_idx[i], + export_for_source_row=topk_ids[i], + ) + if len(original_shapes[i]) == 3: + hidden_states[i] = hidden_states[i].view(original_shapes[i]) + + # the following kernels will be submitted to the comm stream to overlap the computation of the + # moe computation of next microbatch and the attn computation of next layer context = MultiStreamStepMetadata( comm_stream=ms_metadata.communicate_stream, before_comm_event=ms_metadata.ms_events[layer_index][i][ - MSEventKey.MOE_AFTER_COMM], + MSEventKey.FFN_COM_FINISH], after_comm_event=ms_metadata.ms_events[layer_index][i][ MSEventKey.FFN_AR_FINISH], ) - with set_multistream_context(context, i): - if self.mlp.tp_size > 1: - hidden_states[i] = self.mlp._forward_ms_op_tp_allgather( - hidden_states[i], chunk_hidden_states[i], - padded_num_tokens) + context.before_comm_event.record() with torch.npu.stream(ms_metadata.communicate_stream): + context.before_comm_event.wait() + if (self.tp_size > 1 + and fused_moe_state != FusedMoEState.AllGather): + dist.all_gather(list(chunk_hidden_states[i]), + hidden_states[i], self.tp_group) + hidden_states[i] = torch.cat(chunk_hidden_states[i], dim=0) + if num_tokens[i] < self.tp_size: + hidden_states[i] = hidden_states[i][:num_tokens[i]] + elif self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather: + hidden_states[ + i] = dist._functional_collectives.reduce_scatter_tensor( + hidden_states[i], + "sum", + scatter_dim=0, + group=get_dp_group().device_group) + hidden_states[i] = hidden_states[i][:num_tokens[i]] + if self.tp_size > 1 and fused_moe_state == FusedMoEState.AllGather: + hidden_states[i] = tensor_model_parallel_all_reduce( + hidden_states[i]) # last if shared_outputs[i] is not None: - hidden_states[i] = hidden_states[i] + shared_outputs[i] + hidden_states[i] = hidden_states[ + i] * self.routed_scaling_factor + shared_outputs[i] hidden_states[i] = hidden_states[i].view( num_tokens[i], hidden_dims[i]) - if isinstance(self.mlp, CustomDeepseekDBOMLP - ) and hidden_states[i].dtype == torch.float16: - # Fix FP16 overflow - # Scaling the DeepseekV2MLP output, it is the input of - # input_layernorm of next decoder layer. - # The scaling of DeepseekV2MOE output would be done in the forward - # of DeepseekV2MOE - hidden_states[i] *= 1. 
/ self.routed_scaling_factor context.after_comm_event.record() return hidden_states, residual @@ -767,9 +755,7 @@ def _forward_ms_op_post_attn_layernorm( class CustomDeepseekDBOModel(nn.Module): - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config @@ -835,6 +821,7 @@ def forward( attn_metadata: Optional[AttentionMetadata] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, + graph_enable: Optional[bool] = True ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: if inputs_embeds is not None: @@ -848,10 +835,12 @@ def forward( residual = intermediate_tensors["residual"] num_normal_layers = (self.first_k_dense_replace - if VLLM_ASCEND_ENABLE_DBO and self.can_run_ms() - else self.end_layer - self.start_layer) + if VLLM_ASCEND_ENABLE_DBO and not graph_enable + and self.can_run_ms() else self.end_layer - + self.start_layer) - for i in range(self.start_layer, self.start_layer + num_normal_layers): + moe_start_layer = self.start_layer + num_normal_layers + for i in range(self.start_layer, min(moe_start_layer, self.end_layer)): layer = self.layers[i] hidden_states, residual = layer( positions, hidden_states, residual, @@ -859,8 +848,7 @@ def forward( self.start_layer] if kv_caches is not None else None, attn_metadata) - moe_start_layer = self.start_layer + num_normal_layers - if moe_start_layer != self.end_layer: + if moe_start_layer < self.end_layer: # if we enable multistream/dbo, process sparse layers here hidden_states, residual = self._forward_ms_layers( positions=positions, @@ -881,34 +869,18 @@ def forward( def can_run_ms(self): attn_metadata = get_forward_context().attn_metadata - # support mla attention and V1 engine at present - if not self.use_mla or not envs.VLLM_USE_V1: - return False # enable prefill overlap - if attn_metadata is None or attn_metadata.num_prefills == 0: - return False - else: - [token_index, seq_index - ] = compute_split_seq_index(attn_metadata.query_lens, - attn_metadata.attn_state, - attn_metadata.num_decode_tokens) - if token_index == 0 or seq_index == 0 or seq_index == len( - attn_metadata.query_lens): - return False - # check whether the total tokens exceed the threshold - if self.multistream_config is None or attn_metadata.num_actual_tokens < self.multistream_config.min_total_tokens_to_split: + if attn_metadata is None or attn_metadata.num_prefills == 0 or not attn_metadata.enable_dbo_across_dp: return False return True - def _forward_ms_layers( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - residual: torch.Tensor, - moe_start_layer: int, - kv_caches: Optional[List[torch.Tensor]] = None, - is_prefill: bool = False, - ): + def _forward_ms_layers(self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor, + moe_start_layer: int, + kv_caches: Optional[List[torch.Tensor]] = None, + is_prefill: bool = False): if moe_start_layer == self.end_layer: return hidden_states, residual @@ -970,8 +942,9 @@ def forward( attn_metadata: Optional[AttentionMetadata] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, + graph_enable: Optional[bool] = True ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata, 
intermediate_tensors, - inputs_embeds) + inputs_embeds, graph_enable) return hidden_states diff --git a/vllm_ascend/models/deepseek_mtp.py b/vllm_ascend/models/deepseek_mtp.py index 979a6099f1..400c7a0acf 100644 --- a/vllm_ascend/models/deepseek_mtp.py +++ b/vllm_ascend/models/deepseek_mtp.py @@ -28,8 +28,8 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import get_sampler -from vllm.model_executor.layers.vocab_parallel_embedding import \ - VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.models.deepseek_mtp import ( DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer, SharedHead) @@ -40,6 +40,20 @@ from .deepseek_v2 import CustomDeepseekV2DecoderLayer +class CustomDeepSeekShareHead(SharedHead): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: + nn.Module.__init__(self) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "head")) + + class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer): def __init__( @@ -61,7 +75,10 @@ def __init__( self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False) - self.shared_head = SharedHead(config=config, quant_config=quant_config) + self.shared_head = CustomDeepSeekShareHead(config=config, + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "shared_head")) self.mtp_block = CustomDeepseekV2DecoderLayer(config, prefix, model_config, cache_config, diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index b6ec740698..00a4fc6b5e 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -25,7 +25,7 @@ # # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py # """Inference-only DeepseekV2/DeepseekV3 model.""" -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import torch import torch_npu @@ -33,11 +33,11 @@ from torch import nn from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, ModelConfig, VllmConfig -from vllm.distributed import (get_pp_group, +from vllm.config import (CacheConfig, ModelConfig, VllmConfig, + get_current_vllm_config) +from vllm.distributed import (get_dp_group, get_pp_group, get_tensor_model_parallel_world_size, get_tp_group) -from vllm.distributed.parallel_state import get_dp_group from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -65,7 +65,6 @@ from vllm.sequence import IntermediateTensors from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import AscendFusedMoE from vllm_ascend.quantization.quant_config import AscendLinearMethod from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod @@ -285,7 +284,10 @@ def __init__( self.tp_group = get_tp_group().device_group self.tp_rank = get_tp_group().rank_in_group - 
self.ep_group = get_ep_group() + self.kv_consumer = None + transfer_config = get_current_vllm_config().kv_transfer_config + if transfer_config is not None: + self.kv_consumer = transfer_config.kv_role == "kv_consumer" self.params_dtype = torch.get_default_dtype() @@ -293,23 +295,25 @@ def forward( self, hidden_states: torch.Tensor, attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor: + forward_context = get_forward_context() if attn_metadata is None: - attn_metadata = get_forward_context().attn_metadata + attn_metadata = forward_context.attn_metadata + # when profile runs, force experts to load balanced tokens # to avoid high memory consumption on a single rank. - # TODO: need a better flag to indicate whether in profile run or not. - if attn_metadata is None: - # for profile run - is_prefill = True - enable_force_load_balance = True - else: - is_prefill = attn_metadata.num_prefills > 0 - enable_force_load_balance = False - if hasattr(attn_metadata, 'with_prefill_across_dp'): - is_prefill = is_prefill or attn_metadata.with_prefill_across_dp + enable_force_load_balance = forward_context.in_profile_run + + is_prefill = forward_context.with_prefill + # If this node is kv_consumer, we force the moe always runs in decode path to make sure + # the behaviour aligned between dummy_run and normal model_execute. + if self.kv_consumer: + is_prefill = False # router_logits: (num_tokens, n_experts) - router_logits, _ = self.gate(hidden_states) + if self.enable_multistream_moe: + router_logits = None + else: + router_logits, _ = self.gate(hidden_states) experts_hidden_states = self.experts( hidden_states=hidden_states, @@ -318,6 +322,7 @@ def forward( top_k=CustomDeepseekV2MoE.top_k, enable_force_load_balance=enable_force_load_balance, shared_experts=self.shared_experts, + gate=self.gate if self.enable_multistream_moe else None, ) hidden_states = ( @@ -477,7 +482,8 @@ def forward( hidden_states_or_q_c = self.q_a_layernorm(ckq) else: hidden_states_or_q_c = hidden_states - if self.torchair_graph_enabled: + is_mtp_model = attn_metadata is not None and attn_metadata.is_mtp_model + if self.torchair_graph_enabled and not is_mtp_model: forward_kwargs = {} if envs.VLLM_USE_V1: output_shape = hidden_states.shape @@ -736,7 +742,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if get_pp_group().is_last_rank: self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "lm_head")) else: self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) @@ -783,6 +791,14 @@ def get_all_moe_loads(self): def clear_all_moe_loads(self): for layer_id in range(self.num_moe_layers): self.model.layers[layer_id + self.num_dense_layers].mlp.experts.clear_moe_load() + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + weights = filter(lambda x: ".module." not in x[0], weights) + # weights = ((name, data) for name, data in weights if ".module." 
not in name) + loaded_params = super().load_weights(weights) + + return loaded_params + class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM): pass diff --git a/vllm_ascend/multistream/base.py b/vllm_ascend/multistream/base.py index fba58b460e..420839cde6 100644 --- a/vllm_ascend/multistream/base.py +++ b/vllm_ascend/multistream/base.py @@ -14,6 +14,8 @@ class MSEventKey(Enum): MOE_SE_COMM_FINISH = 6 MOE_SE_COMP_FINISH = 7 MOE_GATE_FINISH = 8 + MOE_ALL_TO_ALL = 9 + MOE_ALL_TO_ALL_FINISH = 10 @dataclass diff --git a/vllm_ascend/multistream/metadata.py b/vllm_ascend/multistream/metadata.py index b521d3f85f..e451f15f26 100644 --- a/vllm_ascend/multistream/metadata.py +++ b/vllm_ascend/multistream/metadata.py @@ -170,6 +170,8 @@ def make_multistream_metadata_ds( MSEventKey.MOE_SE_COMM_FINISH, MSEventKey.MOE_SE_COMP_FINISH, MSEventKey.MOE_GATE_FINISH, + MSEventKey.MOE_ALL_TO_ALL, + MSEventKey.MOE_ALL_TO_ALL_FINISH, ] return MultiStreamMetadata( calculate_stream=torch.npu.current_stream(), diff --git a/vllm_ascend/multistream/ms_split.py b/vllm_ascend/multistream/ms_split.py index 430f57b03a..fd32a18abb 100644 --- a/vllm_ascend/multistream/ms_split.py +++ b/vllm_ascend/multistream/ms_split.py @@ -96,10 +96,12 @@ def model_input_split_v1_mla_attn( seq_lens = attn_metadata.prefill.seq_lens if attn_metadata.num_prefills > 0 else attn_metadata.decode.seq_lens [seq_lens_pre, seq_lens_post] = split_attn_tensor_type(seq_lens, seq_index) - query_start_loc_pre = attn_metadata.query_start_loc[:seq_index + 1] - query_start_loc_post = deepcopy( - attn_metadata.query_start_loc[seq_index:] - ) - attn_metadata.query_start_loc[seq_index] + query_start_loc_pre = query_start_loc_post = None + if attn_metadata.query_start_loc is not None: + query_start_loc_pre = attn_metadata.query_start_loc[:seq_index + 1] + query_start_loc_post = deepcopy( + attn_metadata.query_start_loc[seq_index:] + ) - attn_metadata.query_start_loc[seq_index] [block_table_pre, block_table_post] = split_attn_tensor_type(attn_metadata.block_tables, seq_index) @@ -223,7 +225,7 @@ def model_input_split_v1_mla_attn( attn_mask=attn_mask_pre, prefill=prefill_pre, decode=decode_pre, - with_prefill_across_dp=attn_metadata.with_prefill_across_dp, + enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, ) attention_metadata_post = _metadata_cls( num_actual_tokens=attn_metadata.num_actual_tokens - token_index, @@ -240,6 +242,6 @@ def model_input_split_v1_mla_attn( attn_state=attn_state_post, prefill=prefill_post, decode=decode_post, - with_prefill_across_dp=attn_metadata.with_prefill_across_dp, + enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp, ) return [attention_metadata_pre, attention_metadata_post] diff --git a/vllm_ascend/ops/attention.py b/vllm_ascend/ops/attention.py index 8037c9545b..05600aee7a 100644 --- a/vllm_ascend/ops/attention.py +++ b/vllm_ascend/ops/attention.py @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Optional +from typing import List, Optional, Tuple import torch from vllm.model_executor.layers.linear import ColumnParallelLinear @@ -37,7 +37,7 @@ def vanilla_chunked_prefill( scale: float, alibi_slopes: Optional[torch.Tensor], causal: bool = True, -) -> None: +) -> torch.Tensor: num_query_heads = query.shape[1] head_dim = value_cache.shape[3] num_kv_heads = value_cache.shape[2] @@ -138,7 +138,8 @@ def vanilla_chunked_prefill( def vanilla_chunked_prefill_mla( output: torch.Tensor, # (num_tokens, num_heads, v_head_dim) query: torch.Tensor, # (num_tokens, num_heads, nope_dim + rope_dim) - kv_cache: torch.Tensor, # (num_blocks, block_size, latent_kv) + kv_cache: Tuple[ + torch.Tensor], # [nope, rope] (num_blocks, block_size, latent_kv) block_tables: torch.Tensor, # (batch_size, max_num_blocks_per_seq) query_lens: torch.Tensor, # (batch_size) context_lens: torch.Tensor, # (batch_size) @@ -152,22 +153,25 @@ def vanilla_chunked_prefill_mla( alibi_slopes: Optional[torch.Tensor], causal: bool = True) -> None: batch_size = block_tables.size(0) + assert len(kv_cache) > 1 assert query_lens.size(0) == batch_size num_heads = query.size(1) - block_size = kv_cache.size(1) - latent_kv_dim = kv_cache.size(3) - rope_dim + nope_cache = kv_cache[0] + rope_cache = kv_cache[1] + block_size = nope_cache.size(1) + latent_kv_dim = nope_cache.size(-1) max_num_blocks_per_seq = block_tables.size(1) batch_size = query_lens.size(0) - kv_cache = kv_cache.squeeze() - # select kv_c out as [batch_size, max_context_len, latent_kv + rope_dim] - cache_kv_c_pe = kv_cache[block_tables].view( - batch_size, max_num_blocks_per_seq * block_size, - latent_kv_dim + rope_dim)[:, :max_context_len, :] - # get kv_c and k_pe + nope_cache = nope_cache.squeeze() + # select kv_c out as [batch_size, max_context_len, latent_kv + rope_dim] and get kv_c and k_pe # cached_kv_c: [batch_size, max_context_len, latent_kv] # cached_k_pe: [batch_size, max_context_len, rope_dim] - cache_kv_c = cache_kv_c_pe[:, :, :latent_kv_dim] - cache_k_pe = cache_kv_c_pe[:, :, latent_kv_dim:] + cache_kv_c = nope_cache[block_tables].view( + batch_size, max_num_blocks_per_seq * block_size, + latent_kv_dim)[:, :max_context_len, :] + cache_k_pe = rope_cache[block_tables].view( + batch_size, max_num_blocks_per_seq * block_size, + rope_dim)[:, :max_context_len, :] # get k_rope and v # k_nope: [batch_size, max_context_len, num_heads, nope_dim] # value: [batch_size, max_context_len, num_heads, v_head_dim] @@ -258,8 +262,8 @@ def vanilla_chunked_prefill_mla( attn_output = (attn_output[q_mask].view([-1, num_heads, v_head_dim]).to(output.dtype)) - output = output.view([-1, num_heads, v_head_dim]) - output.copy_(attn_output[:query.size(0) - num_add_query]) + attn_output = attn_output.view_as(output) + output.copy_(attn_output) return attn_output diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 0142e6b16b..e0819210d8 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -15,6 +15,7 @@ # This file is a part of the vllm-ascend project. 
# Adapted from vllm/tests/kernels/test_moe.py +import math import os from typing import Any, Callable, List, Optional, Tuple, Union @@ -26,7 +27,8 @@ from vllm.distributed import (GroupCoordinator, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) -from vllm.distributed.parallel_state import get_dp_group, get_tp_group +from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, + get_tp_group) from vllm.forward_context import get_forward_context from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEParallelConfig, MoEConfig, UnquantizedFusedMoEMethod, @@ -36,10 +38,10 @@ import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group +from vllm_ascend.ascend_forward_context import FusedMoEState from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer -from vllm_ascend.utils import (FusedMoEState, dispose_tensor, - get_fused_moe_state, npu_stream_switch, +from vllm_ascend.utils import (AscendSocVersion, dispose_tensor, + get_ascend_soc_version, npu_stream_switch, npu_wait_tensor) MOE_ALL2ALL_BUFFER: bool = envs_ascend.MOE_ALL2ALL_BUFFER @@ -118,9 +120,24 @@ def fused_experts_with_mc2( top_k: int, expert_map: torch.Tensor = None, moe_all_to_all_group_name: Optional[str] = None, - shared_experts: Optional[Any] = None + shared_experts: Optional[Any] = None, + is_torchair: bool = False, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - global_bs = 0 + quant_mode = 0 + ep_group = get_ep_group() + ep_rank_id = ep_group.rank_in_group + ep_world_size = ep_group.world_size + tp_world_size = get_tp_group().world_size + + # NOTE: `global_bs` should be equal to `max_num_tokens_across_dp` * `ep_world_size`, + # and `max_num_tokens_across_dp` has been split into `tp_world_size` parts before. 
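+    # For example (illustrative numbers only): with max_tokens_across_dp=256,
+    # tp_world_size=2 and ep_world_size=16, global_bs = ceil(256 / 2) * 16 = 2048.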
+ global_bs = math.ceil(get_forward_context().max_tokens_across_dp / + tp_world_size) * ep_world_size + + # NOTE: Currently, when in A3 or in torchair graph, we need to pass in some extra param into dispatch & combine + need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3 + or is_torchair) + moe_expert_num = len(expert_map) kwargs_mc2 = { "x": hidden_states, @@ -131,27 +148,20 @@ def fused_experts_with_mc2( "global_bs": global_bs, } - rank = torch.distributed.get_rank() - - quant_mode = 0 - ep_group = get_ep_group().device_group - local_rank = torch.distributed.get_rank(group=ep_group) - all_to_all_group_size = torch.distributed.get_world_size(ep_group) - - tp_size = get_etp_group().world_size - tp_rank = rank % tp_size - stage1_kwargs = { "scales": None, "quant_mode": quant_mode, "group_ep": moe_all_to_all_group_name, - "ep_world_size": all_to_all_group_size, - "ep_rank_id": local_rank, - # "group_tp": self.moe_rs_group_name, - "group_tp": moe_all_to_all_group_name, - "tp_world_size": tp_size, - "tp_rank_id": tp_rank, + "ep_world_size": ep_world_size, + "ep_rank_id": ep_rank_id, } + if need_extra_args: + stage1_kwargs.update({ + "group_tp": moe_all_to_all_group_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) + kwargs_mc2.update(stage1_kwargs) output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2) @@ -204,20 +214,22 @@ def fused_experts_with_mc2( "expert_shard_type": 0, "shared_expert_rank_num": 0, "moe_expert_num": moe_expert_num, - "global_bs": 0, + "global_bs": global_bs, } tp_recv_counts = output[5] stage3_kwargs = { "ep_send_counts": ep_recv_counts, "group_ep": moe_all_to_all_group_name, - "ep_world_size": all_to_all_group_size, - "ep_rank_id": local_rank, - "tp_send_counts": tp_recv_counts, - # "group_tp": self.moe_rs_group_name, - "group_tp": moe_all_to_all_group_name, - "tp_world_size": tp_size, - "tp_rank_id": tp_rank, + "ep_world_size": ep_world_size, + "ep_rank_id": ep_rank_id, } + if need_extra_args: + stage3_kwargs.update({ + "tp_send_counts": tp_recv_counts, + "group_tp": moe_all_to_all_group_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) kwargs_mc2.update(stage3_kwargs) hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2) @@ -847,17 +859,14 @@ def __init__(self, moe: MoEConfig = None): super().__init__(moe=moe) vllm_config = get_current_vllm_config() - self.ep_group = get_ep_group() - self.ep_size = self.ep_group.world_size self.global_batch_size = vllm_config.scheduler_config.max_num_seqs - self.local_batch_size = self.global_batch_size // self.ep_size self.max_model_len = vllm_config.model_config.max_model_len ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled try: - device_group = self.ep_group.device_group + device_group = get_ep_group().device_group # TODO: Try local_rank = ep_group.rank_in_group local_rank = torch.distributed.get_rank(group=device_group) backend = device_group._get_backend(torch.device("npu")) @@ -933,8 +942,7 @@ def apply( if enable_force_load_balance: topk_ids = torch.randint_like(topk_ids, 0, global_num_experts) - fused_moe_state = get_fused_moe_state(self.ep_group.world_size, - is_prefill) + fused_moe_state = get_forward_context().fused_moe_state if fused_moe_state == FusedMoEState.MC2: return fused_experts_with_mc2( hidden_states=x, @@ -945,7 +953,8 @@ def apply( top_k=top_k, expert_map=expert_map, moe_all_to_all_group_name=self.moe_all_to_all_group_name, - shared_experts=shared_experts) + shared_experts=shared_experts, + 
is_torchair=self.torchair_graph_enabled) elif fused_moe_state == FusedMoEState.AllGather: return fused_experts(hidden_states=x, w1=layer.w13_weight, @@ -1061,17 +1070,15 @@ def __init__( self.local_num_experts, self.expert_map = \ expert_load_balancer.get_rank_placement_map( self.moe_instance_id, - get_ep_group().rank_in_group) + self.ep_rank) self.log2phy = expert_load_balancer.get_rank_log2phy_map( - self.moe_instance_id, - get_ep_group().rank_in_group) + self.moe_instance_id, self.ep_rank) self.global_redundant_expert_num = \ expert_load_balancer.get_global_redundant_expert_num() else: # Create a tensor of size num_experts filled with -1 self.local_num_experts, self.expert_map = determine_expert_map( - self.ep_size, - get_ep_group().rank_in_group, self.global_num_experts) + self.ep_size, self.ep_rank, self.global_num_experts) self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled self.enable_multistream_moe = \ @@ -1118,7 +1125,6 @@ def __init__( in ("GPTQMarlinMoEMethod", "CompressedTensorsWNA16MoEMethod")): moe_quant_params["intermediate_size_full"] = intermediate_size - self.ep_group = get_ep_group() # NOTE: self.tp_group is not expert_tp_group self.tp_group = get_tp_group().device_group self.quant_method.create_weights(layer=self, **moe_quant_params) @@ -1129,7 +1135,8 @@ def forward(self, is_prefill: bool, enable_force_load_balance: bool = False, top_k: Optional[int] = None, - shared_experts: Optional[Any] = None): + shared_experts: Optional[Any] = None, + gate: Optional[Any] = None): assert self.quant_method is not None if top_k: @@ -1139,8 +1146,21 @@ def forward(self, num_tokens, hidden_size = hidden_states.shape - fused_moe_state = get_fused_moe_state(self.moe_parallel_config.ep_size, - is_prefill) + fused_moe_state = get_forward_context().fused_moe_state + # For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel. 
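+        # (When multistream MoE is enabled, the gate GEMM below runs on the
+        # default stream while npu_dynamic_quant is issued on the secondary
+        # "moe_secondary" stream, so the two can overlap.)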
+ quantized_x_for_share, dynamic_scale_for_share = None, None + from vllm_ascend.quantization.w8a8_dynamic import \ + AscendW8A8DynamicFusedMoEMethod + if self.enable_multistream_moe: + assert gate is not None + router_logits, _ = gate(hidden_states) + if isinstance(self.quant_method.quant_method, + AscendW8A8DynamicFusedMoEMethod + ) and fused_moe_state == FusedMoEState.MC2: + with npu_stream_switch("moe_secondary", 0): + quantized_x_for_share, dynamic_scale_for_share = torch_npu.npu_dynamic_quant( + hidden_states) + if shared_experts: if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2: shared_hidden_states = shared_experts(hidden_states) @@ -1164,16 +1184,15 @@ def forward(self, if self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather: # NOTE: When in torchair graph, it has been padded in model_runner_v1 if not self.torchair_graph_enabled or is_prefill: - attn_metadata = get_forward_context().attn_metadata - if attn_metadata is not None: - max_num_tokens_across_dp = attn_metadata.max_num_tokens_across_dp - if num_tokens < max_num_tokens_across_dp: - hidden_states = nn.functional.pad( - hidden_states, - (0, 0, 0, max_num_tokens_across_dp - num_tokens)) - router_logits = nn.functional.pad( - router_logits, - (0, 0, 0, max_num_tokens_across_dp - num_tokens)) + max_num_tokens_across_dp = get_forward_context( + ).max_tokens_across_dp + if num_tokens < max_num_tokens_across_dp: + hidden_states = nn.functional.pad( + hidden_states, + (0, 0, 0, max_num_tokens_across_dp - num_tokens)) + router_logits = nn.functional.pad( + router_logits, + (0, 0, 0, max_num_tokens_across_dp - num_tokens)) hidden_states = get_dp_group().all_gather(hidden_states, 0) router_logits = get_dp_group().all_gather(router_logits, 0) @@ -1198,15 +1217,16 @@ def forward(self, global_redundant_expert_num=self.global_redundant_expert_num, shared_experts=shared_experts if self.torchair_graph_enabled and self.enable_multistream_moe and not is_prefill else None, + quantized_x_for_share=quantized_x_for_share, + dynamic_scale_for_share=dynamic_scale_for_share, ) - if self.dynamic_eplb: - self.moe_load += expert_token_num if group_list_type else \ - torch.cat([expert_token_num[:1], expert_token_num[1:] - expert_token_num[:-1]]) - if shared_experts: if isinstance(e_hidden_states, tuple): e_hidden_states, shared_hidden_states = e_hidden_states + if self.dynamic_eplb: + self.moe_load += expert_token_num if group_list_type else \ + torch.cat([expert_token_num[:1], expert_token_num[1:] - expert_token_num[:-1]]) if tp_size > 1 and fused_moe_state != FusedMoEState.AllGather: dist.all_gather(list(chunk_hidden_states), e_hidden_states, diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 39a4c1cfe8..f55ab8e0cb 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -80,10 +80,7 @@ def native_rope_deepseek_forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, - offsets: Optional[torch.Tensor] = None, - max_seq_len: Optional[int] = None): - if max_seq_len is not None and max_seq_len > self.max_seq_len: - _set_cos_sin_cache(self, max_seq_len, query.device, query.dtype) + offsets: Optional[torch.Tensor] = None): if len(key.shape) == 2: key = key[:, None, :] # Note: we implement the non neox_style method with shuffle the last dim and neox style @@ -198,8 +195,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): return q_embed, k_embed -def _set_cos_sin_cache(self, seq_len, device, 
dtype): - self.max_seq_len_cached = seq_len +def _set_cos_sin_cache(self, max_seq_len, device, dtype): dim = self.rotary_dim freq_extra = 1.0 / (self.base**( @@ -219,9 +215,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask self.register_buffer("inv_freq", inv_freq, persistent=False) - t = torch.arange(seq_len * self.scaling_factor, - device=device, - dtype=torch.float32) + t = torch.arange(max_seq_len, device=device, dtype=torch.float32) freqs = torch.outer(t, inv_freq) cos_cached = torch.cat([freqs, freqs], dim=-1).cos() * self.mscale @@ -266,11 +260,10 @@ def deepseek_rope_init_func( super(DeepseekScalingRotaryEmbedding, self).__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype) - self.max_seq_len = max_position_embeddings - _set_cos_sin_cache(self, - max_position_embeddings, - dtype=dtype, - device="npu") + + # NOTE: For ascend friendly computing, reorder sin and cos cache + self.max_seq_len = max_position_embeddings * scaling_factor + _set_cos_sin_cache(self, self.max_seq_len, dtype=dtype, device="npu") RotaryEmbedding.forward_oot = rope_forward_oot diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index d817f9063e..ae87010359 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -47,7 +47,16 @@ # Related PR (if no, explain why): # Future Plan: # Remove those patch when vllm merged them -# 2. `vllm.config.ParallelConfig.get_next_dp_init_port` +# 2. `vllm.v1.engine.core.DPEngineCoreProc._init_data_parallel` +# Why: +# There is some bug for ASCEND_RT_VISIBLE_DEVICES usage. +# How: +# The ASCEND_RT_VISIBLE_DEVICES related code is dropped. +# Related PR (if no, explain why): +# No, this is a bug for vllm ascend +# Future Plan: +# Remove this patch once ASCEND_RT_VISIBLE_DEVICES bug is fixed. +# 3. `vllm.config.ParallelConfig.get_next_dp_init_port` # Why: # vllm doesn't support get port from environment. # How: @@ -56,7 +65,7 @@ # Need a PR to vllm to support get port from environment. # Future Plan: # Remove those patch when vllm merged them -# 3. `vllm.config.ParallelConfig.ParallelConfig.stateless_init_dp_group` +# 4. `vllm.config.ParallelConfig.ParallelConfig.stateless_init_dp_group` # Why: # vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to # get better performance @@ -65,7 +74,19 @@ # Related PR (if no, explain why): # Need a PR to vllm to support more backend. # Future Plan: -# Remove those patch when vllm support more backend. +# Remove those patch when vllm merged them +# +# ** File: platform/patch_common/patch_scheduler.py** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.v1.core.sched.scheduler.Scheduler.destroy_model_parallel()` +# Why: +# Vllm transfer the kv blocks data only when this block have already been full filled. However, this behaviour may cause decode node +# exist prefill behaviour. In order to make decode node work as expected, we always transfer all data whether or not the block is filled. +# How: +# The num_computed_token shall always equals to the token number of request during scheduling. +# Related PR (if no, explain why): https://github.com/vllm-project/vllm/pull/17751 (nixl implementation) +# Future Plan: +# No plan, we will maintain this patch util vllm change it behaviour # # * Worker Patch: # =============== @@ -100,18 +121,6 @@ # Future Plan: # Revert it when the related pr is merged in vllm and vllm-ascend. 
# -# 2. `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor` and -# `vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace` -# Why: -# vLLM `Remove Sampler from Model Code` so vllm-ascend needs adapt to this change. -# How: -# Use vLLM 0.8.4 method to patch it. -# Related PR (if no, explain why): -# - https://github.com/vllm-project/vllm/pull/15195 -# - https://github.com/vllm-project/vllm-ascend/pull/395 -# Future Plan: -# Remove it when we identify the reasons clearly. -# # ** File: worker/patch_common/patch_spec_decode_worker.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker` diff --git a/vllm_ascend/patch/platform/patch_common/patch_distributed.py b/vllm_ascend/patch/platform/patch_common/patch_distributed.py index 86515df86e..d971922840 100644 --- a/vllm_ascend/patch/platform/patch_common/patch_distributed.py +++ b/vllm_ascend/patch/platform/patch_common/patch_distributed.py @@ -17,8 +17,6 @@ # Adapted from vllm/model_executor/models/qwen2_vl.py # This file is a part of the vllm-ascend project. -import vllm -import vllm.distributed import vllm.envs as envs from torch.distributed import ProcessGroup from vllm.config import ParallelConfig @@ -26,25 +24,6 @@ stateless_init_torch_distributed_process_group -def ascend_destroy_model_parallel(): - """Set the groups to none and destroy them.""" - from vllm.distributed.parallel_state import _DP, _PP, _TP - if _TP: - _TP.destroy() - _TP = None - - if _PP: - _PP.destroy() - _PP = None - - if _DP: - _DP.destroy() - _DP = None - from vllm_ascend.distributed.parallel_state import \ - destory_ascend_model_parallel - destory_ascend_model_parallel() - - def parallel_config_get_dp_port(self) -> int: """ We might need to initialize process groups in multiple @@ -78,6 +57,5 @@ def stateless_init_dp_group(self) -> "ProcessGroup": return dp_group -vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port ParallelConfig.stateless_init_dp_group = stateless_init_dp_group diff --git a/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py b/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py index ca87729540..53ce312676 100644 --- a/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py +++ b/vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py @@ -88,20 +88,4 @@ def sampler_output( return filtered_model_outputs, True -def set_include_gpu_probs_tensor(self) -> None: - # Need include_gpu_probs_tensor for MultiSteoWorker - if hasattr(self.model_runner.model, "sampler"): - self.model_runner.model.sampler.include_gpu_probs_tensor = True - self.model_runner.sampler.include_gpu_probs_tensor = True - - -def set_should_modify_greedy_probs_inplace(self) -> None: - if hasattr(self.model_runner.model, "sampler"): - self.model_runner.model.sampler.should_modify_greedy_probs_inplace = ( - True) - self.model_runner.sampler.should_modify_greedy_probs_inplace = True - - MultiStepWorker.sampler_output = torch.inference_mode()(sampler_output) -MultiStepWorker.set_include_gpu_probs_tensor = set_include_gpu_probs_tensor -MultiStepWorker.set_should_modify_greedy_probs_inplace = set_should_modify_greedy_probs_inplace diff --git a/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py b/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py index 66e7aa56b2..d271e65bfc 
100644 --- a/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py +++ b/vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py @@ -57,11 +57,6 @@ def create_worker( ngram_prompt_lookup_min = ( draft_worker_kwargs.pop("ngram_prompt_lookup_min")) - # TODO(Yizhou): A quick fix, must be refactored ASAP - draft_worker_kwargs["vllm_config"].parallel_config.expert_parallel_size = 1 - draft_worker_kwargs[ - "vllm_config"].parallel_config.expert_tensor_parallel_size = 1 - draft_model_config = draft_worker_kwargs["vllm_config"].model_config draft_parallel_config: ParallelConfig = draft_worker_kwargs[ 'vllm_config'].parallel_config @@ -72,6 +67,13 @@ def create_worker( proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) else: + # TODO(Yizhou): A quick fix, must be refactored ASAP + # ngram need not this fix. + draft_worker_kwargs[ + "vllm_config"].parallel_config.expert_parallel_size = 1 + draft_worker_kwargs[ + "vllm_config"].parallel_config.expert_tensor_parallel_size = 1 + draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index b9233da05d..a7e2bc0eea 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -125,17 +125,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: cache_config = vllm_config.cache_config if parallel_config: - # Default value for expert tensor parallel size - parallel_config.expert_tensor_parallel_size = parallel_config.tensor_parallel_size - - # NOTE: When enable_expert_parallel is True, we follow vLLM convention: - # ep_size = world_size, which means expert_tensor_parallel_size must be 1 if parallel_config.enable_expert_parallel: parallel_config.expert_tensor_parallel_size = 1 - # NOTE: When enable_expert_parallel is False and param `asceend_config.expert_tensor_parallel_size` - # is configured, use ascend_config - elif ascend_config.expert_tensor_parallel_size > 0: - parallel_config.expert_tensor_parallel_size = ascend_config.expert_tensor_parallel_size + else: + parallel_config.expert_tensor_parallel_size = parallel_config.world_size_across_dp # Calculate expert parallel size based on world size parallel_config.expert_parallel_size = ( @@ -149,14 +142,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: else: enforce_eager = getattr(model_config, "enforce_eager", False) - if ascend_config.torchair_graph_config.enabled and envs.VLLM_MLA_DISABLE: - # torchair_graph is not supported for V1 without mla currently. - logger.warning( - "Torchair graph mode is still experimental and not supported for V1 without mla currently, " - "Fallback to eager mode.") - ascend_config.torchair_graph_config.enabled = False - enforce_eager = True - check_ascend_config(vllm_config, enforce_eager) if enforce_eager or compilation_config.level == CompilationLevel.NO_COMPILATION: @@ -177,8 +162,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "PIECEWISE compilation enabled on NPU. 
use_inductor not supported - " "using only ACL Graph mode") compilation_config.use_inductor = False - compilation_config.splitting_ops.extend( - ["vllm.unified_ascend_attention_with_output"]) + if not compilation_config.full_cuda_graph: + compilation_config.splitting_ops.extend( + ["vllm.unified_ascend_attention_with_output"]) update_aclgraph_sizes(vllm_config) if parallel_config and parallel_config.worker_cls == "auto": diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py index 3567dba355..1b06a4294a 100644 --- a/vllm_ascend/quantization/quant_config.py +++ b/vllm_ascend/quantization/quant_config.py @@ -34,6 +34,8 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.model_executor.layers.vocab_parallel_embedding import ( + UnquantizedEmbeddingMethod, VocabParallelEmbedding) from vllm.model_executor.parameter import PerTensorScaleParameter from vllm.model_executor.utils import set_weight_attrs @@ -104,6 +106,12 @@ def get_quant_method(self, layer: torch.nn.Module, return AscendUnquantizedFusedMoEMethod() return AscendFusedMoEMethod(self, prefix, self.packed_modules_mapping) + elif isinstance(layer, VocabParallelEmbedding): + if self.is_layer_skipped_ascend(prefix, + self.packed_modules_mapping): + return UnquantizedEmbeddingMethod() + return AscendEmbeddingMethod(self, prefix, + self.packed_modules_mapping) return None def is_layer_skipped_ascend( @@ -194,6 +202,17 @@ def create_weights( layer.register_parameter(perchannel_name, param) set_weight_attrs(param, extra_weight_attrs) + pergroup_dict = self.quant_method.get_pergroup_param( + input_size_per_partition, output_size_per_partition, params_dtype) + for pergroup_name, pergroup_param in pergroup_dict.items(): + param = torch.nn.Parameter(pergroup_param, requires_grad=False) + set_weight_attrs(param, {"output_dim": 0}) + layer.register_parameter(pergroup_name, param) + set_weight_attrs(param, extra_weight_attrs) + if "weight_scale_second" in pergroup_name or "weight_offset_second" in pergroup_name: + setattr(param, "input_dim", 1) + param.input_dim = 1 + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if hasattr(self.quant_method, "process_weights_after_loading"): self.quant_method.process_weights_after_loading(layer) @@ -305,6 +324,10 @@ def create_weights( param = torch.nn.Parameter(param_value, requires_grad=False) layer.register_parameter(param_key, param) set_weight_attrs(param, extra_weight_attrs) + if "weight_scale_second" in param_key or "weight_offset_second" in param_key: + setattr(param, "quant_method", + FusedMoeWeightScaleSupported.GROUP.value) + param.quant_method = FusedMoeWeightScaleSupported.GROUP.value def apply( self, @@ -337,3 +360,20 @@ def apply( def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if hasattr(self.quant_method, "process_weights_after_loading"): self.quant_method.process_weights_after_loading(layer) + + +class AscendEmbeddingMethod(AscendLinearMethod): + """Embedding method for Ascend quantization. + + This class calls AscendQuantizer to search a specific quantization + implementations supported on ascend hardware for Embedding methods. + + Args: + quant_config: The Ascend quantization config. 
+ """ + + def __init__(self, quant_config: AscendQuantConfig, prefix: str, + packed_modules_mapping: Dict[str, Any]) -> None: + self.quantizer = AscendQuantizer.get_quantizer( + quant_config.quant_description, prefix, packed_modules_mapping) + self.quant_method = self.quantizer.build_linear_method() diff --git a/vllm_ascend/quantization/quantizer.py b/vllm_ascend/quantization/quantizer.py index ea1297bf35..d27914139f 100644 --- a/vllm_ascend/quantization/quantizer.py +++ b/vllm_ascend/quantization/quantizer.py @@ -24,6 +24,8 @@ from .func_wrapper import (wrapper_load_model, wrapper_rmsnorm_forward_oot, wrapper_rmsnorm_init) +from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod, + AscendW4A8DynamicLinearMethod) from .w8a8 import AscendW8A8LinearMethod from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod, AscendW8A8DynamicLinearMethod) @@ -263,6 +265,17 @@ def get_quantizer(cls, f"{list(SUPPORT_ASCEND_QUANTIZER_TYPE.keys())}") +class W4A8DYNAMICQuantizer(VLLMAscendQuantizer): + + @staticmethod + def build_linear_method(): + return AscendW4A8DynamicLinearMethod() + + @staticmethod + def build_moe_method(): + return AscendW4A8DynamicFusedMoEMethod() + + class W8A8Quantizer(VLLMAscendQuantizer): @staticmethod @@ -282,6 +295,7 @@ def build_moe_method(): SUPPORT_ASCEND_QUANTIZER_TYPE = { + "W4A8_DYNAMIC": W4A8DYNAMICQuantizer, "W8A8": W8A8Quantizer, "W8A8_DYNAMIC": W8A8DYNAMICQuantizer, } diff --git a/vllm_ascend/quantization/w4a8_dynamic.py b/vllm_ascend/quantization/w4a8_dynamic.py new file mode 100644 index 0000000000..227b6b680a --- /dev/null +++ b/vllm_ascend/quantization/w4a8_dynamic.py @@ -0,0 +1,377 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import Any, Callable, Dict, Optional + +import numpy as np +import torch +import torch_npu +from vllm.config import get_current_vllm_config +from vllm.distributed import get_ep_group +from vllm.forward_context import get_forward_context + +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.ascend_forward_context import FusedMoEState +from vllm_ascend.ops.fused_moe import select_experts +from vllm_ascend.quantization.w8a8_dynamic import (fused_experts_with_all2all, + fused_experts_with_mc2) + + +class AscendW4A8DynamicLinearMethod: + """Linear method for Ascend W4A8_DYNAMIC + """ + + def __init__(self): + self.transpose_weight = True + self.group_size = get_current_vllm_config( + ).quant_config.quant_description.get("group_size", 256) + + @staticmethod + def get_weight(input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + params_dict = { + "weight": torch.empty(output_size, input_size, dtype=torch.int8) + } + return params_dict + + @staticmethod + def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]: + return {} + + @staticmethod + def get_perchannel_param(output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + return {} + + def get_pergroup_param(self, input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + params_dict = {} + params_dict["weight_scale"] = torch.empty(output_size, + 1, + dtype=params_dtype) + params_dict["weight_offset"] = torch.empty(output_size, + 1, + dtype=params_dtype) + params_dict["weight_scale_second"] = torch.empty(output_size, + input_size // + self.group_size, + dtype=params_dtype) + params_dict["weight_offset_second"] = torch.empty(output_size, + input_size // + self.group_size, + dtype=params_dtype) + return params_dict + + @staticmethod + def process_scale_second(weight: torch.Tensor, scale: torch.Tensor, + per_group_scale: torch.Tensor): + k, n = weight.shape + group_num, n = per_group_scale.shape + weight_high = weight.to(torch.float32).reshape( + group_num, -1, n) * per_group_scale.reshape(group_num, 1, n) + weight_high = weight_high.reshape(k, n) + bias = 8 * (weight_high.to(torch.float32) * scale).sum(dim=0) + antiquant_scale = (scale * per_group_scale).reshape(group_num, n) + return antiquant_scale.npu(), bias + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + tp_rank: Optional[int] = None, + ) -> torch.Tensor: + return torch_npu.npu_weight_quant_batchmatmul( + x, + layer.weight, + antiquant_scale=layer.weight_scale_second.to(x.dtype), + antiquant_group_size=self.group_size, + ) + + def process_weights_after_loading(self, layer: torch.nn.Module): + if self.transpose_weight: + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight_scale.data = layer.weight_scale.data.flatten().to( + torch.float32) + layer.weight_offset.data = layer.weight_offset.data.flatten() + layer.weight_scale_second.data, scale_bias = self.process_scale_second( + layer.weight.data, + layer.weight_scale.data, + layer.weight_scale_second.data.transpose(0, 1).contiguous(), + ) + param = torch.nn.Parameter(scale_bias, requires_grad=False) + layer.register_parameter("weight_scale_bias", param) + layer.weight.data = torch_npu.npu_convert_weight_to_int4pack( + layer.weight.data.to(torch.int32)) + + +class AscendW4A8DynamicFusedMoEMethod: + """FusedMoe method for Ascend W4A8_DYNAMIC. 
+ """ + + def __init__(self): + self.transpose_weight = True + + self.ep_group = get_ep_group() + + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + + try: + device_group = self.ep_group.device_group + # TODO: Try local_rank = ep_group.rank_in_group + local_rank = torch.distributed.get_rank(group=device_group) + backend = device_group._get_backend(torch.device("npu")) + self.moe_all_to_all_group_name = backend.get_hccl_comm_name( + local_rank) + except AttributeError: + self.moe_all_to_all_group_name = "" + + @staticmethod + def get_weight(num_experts: int, intermediate_size_per_partition: int, + hidden_sizes: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + param_dict = {} + param_dict["w13_weight"] = torch.empty(num_experts, + 2 * + intermediate_size_per_partition, + hidden_sizes, + dtype=torch.int8) + param_dict["w2_weight"] = torch.empty(num_experts, + hidden_sizes, + intermediate_size_per_partition, + dtype=torch.int8) + return param_dict + + @staticmethod + def get_dynamic_quant_param(num_experts: int, + intermediate_size_per_partition: int, + hidden_sizes: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + param_dict = {} + config = get_current_vllm_config() + group_size = config.quant_config.quant_description.get( + "group_size", 256) + + param_dict["w13_weight_scale"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=params_dtype) + + param_dict["w13_weight_offset"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=params_dtype) + + param_dict["w13_weight_scale_second"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_sizes // group_size, + dtype=params_dtype) + + param_dict["w13_weight_offset_second"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_sizes // group_size, + dtype=params_dtype) + + param_dict["w2_weight_scale"] = torch.empty(num_experts, + hidden_sizes, + 1, + dtype=params_dtype) + param_dict["w2_weight_offset"] = torch.empty(num_experts, + hidden_sizes, + 1, + dtype=params_dtype) + param_dict["w2_weight_scale_second"] = torch.empty( + num_experts, + hidden_sizes, + intermediate_size_per_partition // group_size, + dtype=params_dtype) + param_dict["w2_weight_offset_second"] = torch.empty( + num_experts, + hidden_sizes, + intermediate_size_per_partition // group_size, + dtype=params_dtype) + + return param_dict + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + is_prefill: bool = True, + enable_force_load_balance: bool = True, + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[Any] = None, + **kwargs, + ) -> torch.Tensor: + assert router_logits.shape[ + 1] == global_num_experts, "Number of global experts mismatch" + + # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern + if global_num_experts == 256: + topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k( + router_logits, + k=top_k, # topk当前写8 + bias=e_score_correction_bias, + k_group=topk_group, # fix: 4 + group_count=num_expert_group, 
# fix 8 + group_select_mode=1, # 0: group中的最大; 1: topk2.sum(fix) + renorm=0, # 0: softmax->topk(fix); 1: topk->softmax + norm_type=1, # 0: softmax; 1: sigmoid(fix) + # out_flag=False, # todo new api; 第三个输出是否输出 + # y2_flag=False, # old api; 第三个输出是否输出 + routed_scaling_factor=1, + eps=float(1e-20)) + else: + topk_weights, topk_ids = select_experts( + hidden_states=x, + router_logits=router_logits, + top_k=top_k, + use_grouped_topk=use_grouped_topk, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + + # this is a naive implementation for experts load balance so as + # to avoid accumulating too much tokens on a single rank. + # currently it is only activated when doing profile runs. + if enable_force_load_balance: + topk_ids = torch.randint_like(topk_ids, 0, global_num_experts) + + topk_weights = topk_weights.to(x.dtype) + + fused_moe_state = get_forward_context().fused_moe_state + if fused_moe_state == FusedMoEState.MC2: + return fused_experts_with_mc2( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + w1_scale=layer.w13_weight_scale_second, + w2_scale=layer.w2_weight_scale_second, + w1_scale_bias=layer.w13_scale_bias, + w2_scale_bias=layer.w2_scale_bias, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + expert_map=expert_map, + moe_all_to_all_group_name=self.moe_all_to_all_group_name, + log2phy=log2phy, + global_redundant_expert_num=global_redundant_expert_num, + shared_experts=shared_experts, + is_torchair=self.torchair_graph_enabled) + else: + # The current implementation of deepseek moe splits hidden_states + # according to tp_size before they are feed into fused_moe module. + # Therefore, all2all is needed no matter how dp/tp is set so as to + # dispatch/combine tokens. 
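+            # The second-level (per-group) scales and the scale biases built in
+            # process_weights_after_loading are forwarded here, same as in the
+            # MC2 branch above.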
+ return fused_experts_with_all2all( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + w1_scale=layer.w13_weight_scale_second, + w2_scale=layer.w2_weight_scale_second, + w1_scale_bias=layer.w13_scale_bias, + w2_scale_bias=layer.w2_scale_bias, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + expert_map=expert_map, + ep_group=self.ep_group, + log2phy=log2phy, + global_redundant_expert_num=global_redundant_expert_num, + ) + + def process_scale(self, weight: torch.Tensor, scale, per_group_scale): + group_num, k, n = weight.shape + per_group_scale = per_group_scale.reshape(group_num, -1, n) + group_num, quantgroup_num, n = per_group_scale.shape + weight_high = weight.to(torch.float32).reshape([group_num, quantgroup_num, -1, n]) * \ + per_group_scale.reshape([group_num, quantgroup_num, 1, n]) + weight_high = weight_high.reshape([group_num, k, n]) + bias = 8 * (weight_high.to(torch.float32) * scale).sum(axis=1) + scale_fp32 = (scale * per_group_scale).to(torch.float16).to( + torch.float32) + scale_fp32_np = scale_fp32.cpu().numpy() + scale_fp32_np.dtype = np.uint32 + sscale_uint64 = np.zeros((group_num, quantgroup_num, n * 2), + dtype=np.uint32) + + sscale_uint64[..., ::2] = scale_fp32_np + + sscale_uint64_buffer = np.frombuffer(sscale_uint64.tobytes(), + dtype=np.int64).copy() + sscale_uint64_tensor = torch.from_numpy(sscale_uint64_buffer).reshape( + group_num, quantgroup_num, n) + sscale_uint64_tensor = sscale_uint64_tensor.npu() + return sscale_uint64_tensor, bias + + def process_weights_after_loading(self, layer): + if self.transpose_weight: + layer.w13_weight.data = layer.w13_weight.data.transpose( + 1, 2).contiguous() + layer.w2_weight.data = layer.w2_weight.data.transpose( + 1, 2).contiguous() + layer.w13_weight_scale.data = layer.w13_weight_scale.data.transpose( + 1, 2).contiguous() + layer.w2_weight_scale.data = layer.w2_weight_scale.data.transpose( + 1, 2).contiguous() + layer.w13_weight_offset.data = layer.w13_weight_offset.data.view( + layer.w13_weight_offset.data.shape[0], -1) + layer.w2_weight_offset.data = layer.w2_weight_offset.data.view( + layer.w2_weight_offset.data.shape[0], -1) + layer.w13_weight_scale_second.data = layer.w13_weight_scale_second.data.transpose( + 1, 2).contiguous() + layer.w2_weight_scale_second.data = layer.w2_weight_scale_second.data.transpose( + 1, 2).contiguous() + + layer.w13_weight_scale_second.data, bias = self.process_scale( + layer.w13_weight, layer.w13_weight_scale.data, + layer.w13_weight_scale_second.data) + param = torch.nn.Parameter(bias, requires_grad=False) + layer.register_parameter("w13_scale_bias", param) + layer.w2_weight_scale_second.data, bias1 = self.process_scale( + layer.w2_weight, layer.w2_weight_scale.data, + layer.w2_weight_scale_second.data) + param = torch.nn.Parameter(bias1, requires_grad=False) + layer.register_parameter("w2_scale_bias", param) + + layer.w13_weight.data = torch_npu.npu_quantize( + layer.w13_weight.data.to(torch.float32), + torch.tensor([1.]).npu(), None, torch.quint4x2, -1, False) + layer.w2_weight.data = torch_npu.npu_quantize( + layer.w2_weight.data.to(torch.float32), + torch.tensor([1.]).npu(), None, torch.quint4x2, -1, False) diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py index db23cb024d..28925034c1 100644 --- a/vllm_ascend/quantization/w8a8.py +++ b/vllm_ascend/quantization/w8a8.py @@ -20,6 +20,9 @@ import torch import torch_npu +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ + 
def quant_per_tensor(in_tensor: torch.Tensor, input_scale: torch.Tensor, input_offset: torch.Tensor): @@ -37,6 +40,8 @@ class AscendW8A8LinearMethod: def __init__(self) -> None: # aclnn quant matmul requires to transpose matrix B, set to true by default. self.transpose_weight = True + ascend_config = get_ascend_config() + self.enable_weight_nz_layout = ascend_config.enable_weight_nz_layout @staticmethod def get_weight( @@ -77,6 +82,10 @@ def get_perchannel_param( dtype=params_dtype) return params_dict + def get_pergroup_param(self, input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + return {} + @staticmethod def apply( layer: torch.nn.Module, @@ -110,6 +119,9 @@ def process_weights_after_loading(self, layer): requires_grad=False).to(layer.aclnn_input_scale.dtype) if self.transpose_weight: layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - layer.weight.data = torch_npu.npu_format_cast(layer.weight.data, 29) + if self.enable_weight_nz_layout: + # cast quantized weight tensors in NZ layout for higher inference speed + layer.weight.data = torch_npu.npu_format_cast( + layer.weight.data, ACL_FORMAT_FRACTAL_NZ) layer.weight_scale.data = torch.flatten(layer.weight_scale.data) layer.weight_offset.data = torch.flatten(layer.weight_offset.data) diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index 4f165ed3eb..8066bab4ed 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -15,19 +15,95 @@ # limitations under the License. # -from typing import Any, Callable, Dict, Optional, Tuple, Union +import math +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.distributed as dist import torch_npu -from vllm.distributed import GroupCoordinator +from vllm.distributed import GroupCoordinator, get_ep_group, get_tp_group +from vllm.forward_context import get_forward_context from vllm_ascend.ascend_config import get_ascend_config -from vllm_ascend.distributed.parallel_state import get_ep_group +from vllm_ascend.ascend_forward_context import FusedMoEState from vllm_ascend.ops.fused_moe import select_experts -from vllm_ascend.utils import (FusedMoEState, dispose_tensor, - get_fused_moe_state, npu_stream_switch, - npu_wait_tensor) +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendSocVersion, + dispose_tensor, get_ascend_soc_version, + npu_stream_switch, npu_wait_tensor) + + +def apply_mlp_decode(hidden_states_wrapper: List[torch.Tensor], + w1: torch.Tensor, + w1_scale: torch.Tensor, + w2: torch.Tensor, + w2_scale: torch.Tensor, + group_list: torch.Tensor, + dynamic_scale: torch.Tensor = None, + group_list_type: int = 1) -> torch.Tensor: + """ + apply MLP: gate_up_proj -> swiglu -> down_proj + Args: + hidden_states_wrapper: wrapper of input hidden states with shape (num_tokens, hidden_size). + w1: expert weights1 with shape + (num_experts, hidden_size, intermediate_size * 2) + w1_scale: weights1 scale with shape (num_experts, intermediate_size * 2) + w2: expert weights2 with shape + (num_experts, intermediate_size, hidden_size) + w2_scale: weights2 scale with shape (num_experts, hidden_size) + group_list: number of tokens for each expert, follow cumsum mode, and + with shape (num_experts). 
+ transpose_weight: + w1: (num_experts, intermediate_size * 2, hidden_size) -> + (num_experts, hidden_size, intermediate_size * 2) + w2: (num_experts, hidden_size, intermediate_size) -> + (num_experts, intermediate_size, hidden_size) + Returns: + hidden_states: output hidden states after MLP. + """ + + assert len(hidden_states_wrapper) == 1 + hidden_states = hidden_states_wrapper.pop() + if dynamic_scale is None: + hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant( + hidden_states) + else: + pertoken_scale = dynamic_scale + + # gmm1: gate_up_proj + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w1], + split_item=3, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + output_dtype=torch.int32)[0] + + # act_fn: swiglu + hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant( + x=hidden_states, + weight_scale=w1_scale, + activation_scale=pertoken_scale, + bias=None, + quant_scale=None, + quant_offset=None, + group_index=group_list, + activate_left=True, + quant_mode=1, + ) + + # gmm2: down_proj + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w2], + scale=[w2_scale], + per_token_scale=[swiglu_out_scale], + split_item=2, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + output_dtype=w2_scale.dtype)[0] + return hidden_states def apply_mlp(hidden_states: torch.Tensor, @@ -37,7 +113,9 @@ def apply_mlp(hidden_states: torch.Tensor, w2_scale: torch.Tensor, group_list: torch.Tensor, dynamic_scale: torch.Tensor = None, - group_list_type: int = 1) -> torch.Tensor: + group_list_type: int = 1, + w1_scale_bias: torch.Tensor = None, + w2_scale_bias: torch.Tensor = None) -> torch.Tensor: """ apply MLP: gate_up_proj -> swiglu -> down_proj @@ -71,17 +149,31 @@ def apply_mlp(hidden_states: torch.Tensor, else: pertoken_scale = dynamic_scale + bias1, bias2 = None, None + _output_dtype = w2_scale.dtype + + if w1_scale_bias is not None: + if group_list_type == 0: + group_list = torch.cat( + [group_list[:1], torch.diff(group_list, dim=0)]) + group_list_type = 1 + bias1 = [w1_scale_bias] + bias2 = [w2_scale_bias] + # TODO w4a8 scene: dynamic acquisition of dtype in the future + _output_dtype = torch.bfloat16 + # gmm1: gate_up_proj hidden_states = torch_npu.npu_grouped_matmul( x=[hidden_states], weight=[w1], scale=[w1_scale], + bias=bias1, per_token_scale=[pertoken_scale], split_item=2, group_list_type=group_list_type, group_type=0, group_list=group_list, - output_dtype=w2_scale.dtype)[0] + output_dtype=_output_dtype)[0] # act_fn: swiglu hidden_states = torch_npu.npu_swiglu(hidden_states) @@ -93,12 +185,13 @@ def apply_mlp(hidden_states: torch.Tensor, x=[hidden_states], weight=[w2], scale=[w2_scale], + bias=bias2, per_token_scale=[swiglu_out_scale], split_item=2, group_list_type=group_list_type, group_type=0, group_list=group_list, - output_dtype=w2_scale.dtype)[0] + output_dtype=_output_dtype)[0] return hidden_states @@ -117,11 +210,33 @@ def fused_experts_with_mc2( log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, shared_experts: Optional[Any] = None, + is_torchair: bool = False, + w1_scale_bias: torch.Tensor = None, + w2_scale_bias: torch.Tensor = None, + quantized_x_for_share: Optional[Any] = None, + dynamic_scale_for_share: Optional[Any] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - if log2phy is not None: + if log2phy: topk_ids = log2phy[topk_ids] - global_bs = 0 - moe_expert_num = len(expert_map) + global_redundant_expert_num + 
quant_mode = 2 + ep_group = get_ep_group() + ep_rank_id = ep_group.rank_in_group + ep_world_size = ep_group.world_size + tp_world_size = get_tp_group().world_size + + # NOTE: `global_bs` should be equal to `max_num_tokens_across_dp` * `ep_world_size`, + # and `max_num_tokens_across_dp` has been split into `tp_world_size` parts before. + global_bs = math.ceil(get_forward_context().max_tokens_across_dp / + tp_world_size) * ep_world_size + + # NOTE: Currently, when in A3 or in torchair graph, we need to pass in some extra param into dispatch & combine + need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3 + or is_torchair) + + if (expert_map is not None): + moe_expert_num = len(expert_map) + global_redundant_expert_num + else: + moe_expert_num = global_redundant_expert_num # hidden_states = hidden_states.bfloat16() kwargs_mc2 = { "x": hidden_states, @@ -130,53 +245,43 @@ def fused_experts_with_mc2( "shared_expert_rank_num": 0, "moe_expert_num": moe_expert_num, "global_bs": global_bs, - "expert_scales": topk_weights.to(torch.float32), } - rank = torch.distributed.get_rank() - - quant_mode = 2 - ep_group = get_ep_group().device_group - local_rank = torch.distributed.get_rank(group=ep_group) - all_to_all_group_size = torch.distributed.get_world_size(ep_group) - - world_szie = torch.distributed.get_world_size() - tp_size = world_szie // all_to_all_group_size - tp_rank = rank % tp_size - stage1_kwargs = { "scales": None, "quant_mode": quant_mode, "group_ep": moe_all_to_all_group_name, - "ep_world_size": all_to_all_group_size, - "ep_rank_id": local_rank, - # "group_tp": self.moe_rs_group_name, - "group_tp": moe_all_to_all_group_name, - "tp_world_size": tp_size, - "tp_rank_id": tp_rank, + "ep_world_size": ep_world_size, + "ep_rank_id": ep_rank_id, } + if need_extra_args: + stage1_kwargs.update({ + "group_tp": moe_all_to_all_group_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) kwargs_mc2.update(stage1_kwargs) output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2) # comm_stream.wait_stream(torch.npu.current_stream()) - expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts, _, expand_scales = output[ - 0:7] + expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[ + 0:5] if shared_experts is not None: with npu_stream_switch("moe_secondary", 0): - npu_wait_tensor(hidden_states, topk_weights) - shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states) - npu_wait_tensor(shared_gate_up[0], expand_x) - shared_act = shared_experts.act_fn(shared_gate_up) + npu_wait_tensor(quantized_x_for_share, expand_x) + shared_act_out = shared_experts.act_fn( + (quantized_x_for_share, dynamic_scale_for_share)) + shared_act, swiglu_out_scale = shared_act_out[0], shared_act_out[1] # `expand_x` will be disposed in the `apply_mlp` function - down_out_list = apply_mlp(expand_x, - w1, - w1_scale, - w2, - w2_scale, - expert_token_nums, - dynamic_scale=dynamic_scale) + down_out_list = apply_mlp_decode([expand_x], + w1, + w1_scale, + w2, + w2_scale, + expert_token_nums, + dynamic_scale=dynamic_scale) # moeCombine kwargs_mc2 = { @@ -187,8 +292,7 @@ def fused_experts_with_mc2( "expert_shard_type": 0, "shared_expert_rank_num": 0, "moe_expert_num": moe_expert_num, - "global_bs": 0, - "expand_scales": expand_scales, + "global_bs": global_bs, } tp_recv_counts = torch.empty(1, dtype=torch.int32, @@ -196,14 +300,16 @@ def fused_experts_with_mc2( stage3_kwargs = { "ep_send_counts": ep_recv_counts, "group_ep": moe_all_to_all_group_name, - "ep_world_size": 
all_to_all_group_size, - "ep_rank_id": local_rank, - "tp_send_counts": tp_recv_counts, - # "group_tp": self.moe_rs_group_name, - "group_tp": moe_all_to_all_group_name, - "tp_world_size": tp_size, - "tp_rank_id": tp_rank, + "ep_world_size": ep_world_size, + "ep_rank_id": ep_rank_id, } + if need_extra_args: + stage3_kwargs.update({ + "tp_send_counts": tp_recv_counts, + "group_tp": moe_all_to_all_group_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) kwargs_mc2.update(stage3_kwargs) hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2) @@ -213,28 +319,29 @@ def fused_experts_with_mc2( return hidden_states, expert_token_nums, group_list_type else: with npu_stream_switch("moe_secondary", 0): - npu_wait_tensor(shared_act[0], down_out_list) - shared_output, _ = shared_experts.down_proj(shared_act) + npu_wait_tensor(shared_act, down_out_list) + shared_output, _ = shared_experts.down_proj( + (shared_act, swiglu_out_scale)) return hidden_states, shared_output, expert_token_nums, group_list_type # currently expert parallelism implemented with all2all # is under-optimized. -def fused_experts_with_all2all( - hidden_states: torch.Tensor, - w1: torch.Tensor, - w1_scale: torch.Tensor, - w2: torch.Tensor, - w2_scale: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - expert_map: torch.Tensor = None, - ep_group: GroupCoordinator = None, - log2phy: torch.Tensor = None, - global_redundant_expert_num: int = 0, -): - if log2phy is not None: +def fused_experts_with_all2all(hidden_states: torch.Tensor, + w1: torch.Tensor, + w1_scale: torch.Tensor, + w2: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + ep_group: GroupCoordinator = None, + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + w1_scale_bias: torch.Tensor = None, + w2_scale_bias: torch.Tensor = None): + if log2phy: topk_ids = log2phy[topk_ids] original_shape = hidden_states.shape if len(original_shape) == 3: @@ -312,7 +419,9 @@ def fused_experts_with_all2all( w2, w2_scale, expert_tokens, #16 - group_list_type=group_list_type) + group_list_type=group_list_type, + w1_scale_bias=w1_scale_bias, + w2_scale_bias=w2_scale_bias) if expert_map is not None: resorted_idx = torch.argsort(sorted_idx) @@ -467,6 +576,8 @@ class AscendW8A8DynamicLinearMethod: def __init__(self): self.transpose_weight = True + ascend_config = get_ascend_config() + self.enable_weight_nz_layout = ascend_config.enable_weight_nz_layout @staticmethod def get_weight(input_size: int, output_size: int, @@ -494,6 +605,10 @@ def get_perchannel_param( dtype=params_dtype) return params_dict + def get_pergroup_param(self, input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + return {} + @staticmethod def apply( layer: torch.nn.Module, @@ -528,8 +643,10 @@ def apply( def process_weights_after_loading(self, layer): if self.transpose_weight: layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() - # cast quantized weight tensors in NZ format (29) for higher inference speed - layer.weight.data = torch_npu.npu_format_cast(layer.weight.data, 29) + if self.enable_weight_nz_layout: + # cast quantized weight tensors in NZ layout for higher inference speed + layer.weight.data = torch_npu.npu_format_cast( + layer.weight.data, ACL_FORMAT_FRACTAL_NZ) layer.weight_scale.data = layer.weight_scale.data.flatten() layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32) layer.weight_offset.data = 
layer.weight_offset.data.flatten() @@ -546,6 +663,7 @@ def __init__(self): ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_weight_nz_layout = ascend_config.enable_weight_nz_layout try: device_group = self.ep_group.device_group @@ -619,6 +737,8 @@ def apply( log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, shared_experts: Optional[Any] = None, + quantized_x_for_share: Optional[Any] = None, + dynamic_scale_for_share: Optional[Any] = None, **kwargs, ) -> torch.Tensor: assert router_logits.shape[ @@ -653,6 +773,16 @@ def apply( e_score_correction_bias=e_score_correction_bias, ) + fused_moe_state = get_forward_context().fused_moe_state + shared_gate_up, shared_dequant_scale = None, None + if shared_experts is not None and fused_moe_state == FusedMoEState.MC2: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(quantized_x_for_share, router_logits) + share_up_out, _ = shared_experts.gate_up_proj( + (quantized_x_for_share, dynamic_scale_for_share)) + shared_gate_up, shared_dequant_scale = share_up_out[ + 0], share_up_out[1] + # this is a naive implementation for experts load balance so as # to avoid accumulating too much tokens on a single rank. # currently it is only activated when doing profile runs. @@ -661,14 +791,12 @@ def apply( topk_weights = topk_weights.to(x.dtype) - fused_moe_state = get_fused_moe_state(self.ep_group.world_size, - is_prefill) if fused_moe_state == FusedMoEState.MC2: return fused_experts_with_mc2( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, - w1_scale=layer.w13_weight_scale, + w1_scale=layer.w13_weight_scale_fp32, w2_scale=layer.w2_weight_scale, topk_weights=topk_weights, topk_ids=topk_ids, @@ -678,6 +806,9 @@ def apply( log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, shared_experts=shared_experts, + is_torchair=self.torchair_graph_enabled, + quantized_x_for_share=shared_gate_up, + dynamic_scale_for_share=shared_dequant_scale, **kwargs) elif fused_moe_state == FusedMoEState.AllGather: return fused_experts(hidden_states=x, @@ -715,8 +846,16 @@ def process_weights_after_loading(self, layer): 1, 2).contiguous() layer.w2_weight.data = layer.w2_weight.data.transpose( 1, 2).contiguous() + if self.enable_weight_nz_layout: + # cast quantized weight tensors in NZ layout for higher inference speed + layer.w13_weight.data = torch_npu.npu_format_cast( + layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ) + layer.w2_weight.data = torch_npu.npu_format_cast( + layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ) layer.w13_weight_scale.data = layer.w13_weight_scale.data.view( layer.w13_weight_scale.data.shape[0], -1) + layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to( + torch.float32) layer.w13_weight_offset.data = layer.w13_weight_offset.data.view( layer.w13_weight_offset.data.shape[0], -1) layer.w2_weight_scale.data = layer.w2_weight_scale.data.view( diff --git a/vllm_ascend/soc_info.py b/vllm_ascend/soc_info.py new file mode 100644 index 0000000000..ac1317e8e1 --- /dev/null +++ b/vllm_ascend/soc_info.py @@ -0,0 +1,14 @@ +from dataclasses import dataclass + +import torch_npu + + +@dataclass +class NPUSocInfo: + is_a3: bool = False + + def __post_init__(self): + torch_npu.npu._lazy_init() + self.soc_version = torch_npu._C._npu_get_soc_version() + if self.soc_version in (250, 251, 252, 253, 254, 255): + self.is_a3 = True diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index eeab287906..f7ca0aba2e 100644 --- a/vllm_ascend/utils.py +++ 
b/vllm_ascend/utils.py @@ -20,12 +20,13 @@ import atexit import math from contextlib import contextmanager, nullcontext +from dataclasses import dataclass from enum import Enum from threading import Lock -from typing import TYPE_CHECKING, List, Tuple +from typing import TYPE_CHECKING, List, Optional, Tuple import torch -import torch_npu # noqa: F401 +import torch_npu import torchair # type: ignore[import] # noqa: F401 from packaging.version import InvalidVersion, Version from torch_npu.npu.streams import Event @@ -57,6 +58,9 @@ CUSTOM_OP_ENABLED = None +ACL_FORMAT_ND = 2 +ACL_FORMAT_FRACTAL_NZ = 29 + def try_register_lib(lib_name: str, lib_info: str = ""): import importlib @@ -168,6 +172,27 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: original_sizes, compilation_config.cudagraph_capture_sizes = \ compilation_config.cudagraph_capture_sizes, None + if compilation_config.full_cuda_graph: + max_num_seqs = vllm_config.scheduler_config.max_num_seqs + truncated_sizes = [x for x in original_sizes if x <= max_num_seqs] + compilation_config.init_with_cudagraph_sizes(truncated_sizes) + + warning_message = """\033[91m + ********************************************************************************** + * WARNING: You have enabled the *full graph* feature. + * This is an early experimental stage and may involve various unknown issues. + * A known problem is that capturing too many batch sizes can lead to OOM + * (Out of Memory) errors or inference hangs. If you encounter such issues, + * consider reducing `gpu_memory_utilization` or manually specifying a smaller + * batch size for graph capture. + * For more details, please refer to: + * https://docs.vllm.ai/en/stable/configuration/conserving_memory.html#reduce-cuda-graphs + **********************************************************************************\033[0m + """ + + logger.warning(warning_message) + return + # Calculate parallel configuration factor num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers parallel_config = vllm_config.parallel_config @@ -278,19 +303,58 @@ def npu_wait_tensor(self: torch.Tensor, return _npu_wait_tensor(self, dependency) if enabled else self -# TODO(zzzzwwjj): move this into forward_context -class FusedMoEState(Enum): - AllGather = 0 - All2All = 1 - MC2 = 2 +class AscendSocVersion(Enum): + A2 = 0 + A3 = 1 + MAX = 2 + + +_ascend_soc_version = None -# TODO(zzzzwwjj): add soc_version to choose branch -def get_fused_moe_state(ep_size: int, with_prefill: bool): - if ep_size == 1: - return FusedMoEState.AllGather - # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph. 
- elif ep_size < 16 or with_prefill: - return FusedMoEState.All2All +def init_ascend_soc_version(): + soc_version = torch_npu.npu.get_soc_version() + global _ascend_soc_version + if 220 <= soc_version <= 225: + _ascend_soc_version = AscendSocVersion.A2 + elif 250 <= soc_version <= 255: + _ascend_soc_version = AscendSocVersion.A3 else: - return FusedMoEState.MC2 + _ascend_soc_version = AscendSocVersion.MAX + + +def get_ascend_soc_version(): + global _ascend_soc_version + assert _ascend_soc_version is not None + return _ascend_soc_version + + +@dataclass +class GraphParams: + events: dict[int, list[torch.npu.ExternalEvent]] + workspaces: dict[int, torch.Tensor] + handles: dict[int, list[torch_npu._C._NPUTaskGroupHandle]] + attn_params: dict[int, list[tuple]] + + +_graph_params: Optional[GraphParams] = None + + +def set_graph_params(aclgraph_capture_sizes: set[int]): + global _graph_params + if _graph_params is not None: + raise ValueError("Graph parameters have already been set!") + _graph_params = GraphParams( + {size: [] + for size in aclgraph_capture_sizes}, + {size: None + for size in aclgraph_capture_sizes}, + {size: [] + for size in aclgraph_capture_sizes}, + {size: [] + for size in aclgraph_capture_sizes}, + ) + + +def get_graph_params(): + return _graph_params diff --git a/vllm_ascend/worker/draft_model_runner.py b/vllm_ascend/worker/draft_model_runner.py index 1306b1e160..bfd513d5fe 100644 --- a/vllm_ascend/worker/draft_model_runner.py +++ b/vllm_ascend/worker/draft_model_runner.py @@ -18,7 +18,6 @@ from typing import List, Optional import torch -from vllm.forward_context import set_forward_context from vllm.logger import logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MultiModalKwargs @@ -27,6 +26,7 @@ ModelRunnerInputBase, ModelRunnerWrapperBase) +from vllm_ascend.ascend_forward_context import set_ascend_forward_context from vllm_ascend.attention.attention import AscendMetadata # A flag to enable debug prints for the updated input tensors @@ -51,12 +51,6 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): """ def __init__(self, model_runner: ModelRunnerBase): - if hasattr( - model_runner, - "return_hidden_states") and model_runner.return_hidden_states: - raise ValueError( - "return_hidden_states is not supported for TP1DraftModelRunner." 
- ) super().__init__(model_runner) self.indices_of_seq_with_bonus_tokens = None @@ -211,6 +205,9 @@ def execute_model( if self.prompt_adapter_config is not None: raise ValueError("TP1DraftModelRunner has no support for " "prompt_adapter_config") + if model_input.inputs_embeds is not None: + raise ValueError("TP1DraftModelRunner has no support for " + "inputs_embeds") if model_input.multi_modal_kwargs: raise ValueError( "TP1DraftModelRunner has no support for multi_modal_kwargs" @@ -264,14 +261,15 @@ def execute_model( spec_step_idx = kwargs.get("spec_step_idx", step) model_execute_kwargs["spec_step_idx"] = spec_step_idx compute_logits_kwargs["spec_step_idx"] = spec_step_idx - with set_forward_context(model_input.attn_metadata, - self.vllm_config): + with set_ascend_forward_context(model_input.attn_metadata, + self.vllm_config): if model_input.attn_metadata is not None: model_input.attn_metadata.input_positions = model_input.input_positions hidden_states = model_executable( input_ids=model_input.input_tokens, + inputs_embeds=None, positions=model_input.input_positions, intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs(multi_modal_kwargs, @@ -293,6 +291,9 @@ def execute_model( ) outputs.append(output) + if self.return_hidden_states and is_fallback: + output.hidden_states = hidden_states + if model_input.attn_metadata.num_prefills == 0 \ and self.indices_of_seq_with_bonus_tokens is not None: assert output.sampled_token_ids is not None diff --git a/vllm_ascend/worker/model_runner.py b/vllm_ascend/worker/model_runner.py index 48c5d4b68f..7846f655d1 100644 --- a/vllm_ascend/worker/model_runner.py +++ b/vllm_ascend/worker/model_runner.py @@ -35,7 +35,6 @@ from vllm.core.scheduler import SchedulerOutputs from vllm.distributed import broadcast_tensor_dict, get_dp_group, get_pp_group from vllm.distributed.kv_transfer import get_kv_transfer_group -from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import logger from vllm.lora.layers import LoRAMapping @@ -66,6 +65,7 @@ _init_sampling_metadata_from_tensor_dict) from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.ascend_forward_context import set_ascend_forward_context if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -1431,8 +1431,12 @@ def execute_model( model_forward_start.record() if not bypass_model_exec: - with set_forward_context(model_input.attn_metadata, - self.vllm_config, virtual_engine): + with set_ascend_forward_context( + model_input.attn_metadata, + self.vllm_config, + virtual_engine, + with_prefill=prefill_meta is not None, + in_profile_run=self.in_profile_run): if model_input.attn_metadata is not None: model_input.attn_metadata.input_positions = model_input.input_positions if self.torchair_graph_enabled: diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 1dc0a0ebd5..dec73144ed 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -17,7 +17,9 @@ # Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py # +import copy import gc +import math import os import time import types @@ -32,13 +34,18 @@ import torch._dynamo.cache_size import torch.distributed as dist import torch.nn as nn +import torchair from torch.distributed import ReduceOp +from torchair import patch_for_hcom from vllm.attention import AttentionType, get_attn_backend from vllm.attention.layer import Attention from vllm.config import 
CompilationLevel, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.kv_transfer import (get_kv_transfer_group, + has_kv_transfer_group) +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 from vllm.distributed.parallel_state import get_dp_group, get_pp_group -from vllm.forward_context import set_forward_context +from vllm.forward_context import get_forward_context from vllm.inputs import INPUT_REGISTRY from vllm.logger import logger from vllm.model_executor.layers.fused_moe import FusedMoE @@ -69,12 +76,15 @@ scatter_mm_placeholders) from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.ascend_forward_context import set_ascend_forward_context from vllm_ascend.attention.attention import AttentionMaskBuilder from vllm_ascend.attention.attention_v1 import AscendAttentionState -from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata +from vllm_ascend.attention.utils import \ + AscendCommonAttentionMetadata as CommonAttentionMetadata +from vllm_ascend.multistream.ms_split import compute_split_seq_index from vllm_ascend.platform import NPUPlatform from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler -from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is +from vllm_ascend.utils import ProfileExecuteDuration from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer from vllm_ascend.eplb.eplb_updator import EplbUpdator @@ -136,6 +146,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config + self.parallel_config = vllm_config.parallel_config self.scheduler_config = vllm_config.scheduler_config self.speculative_config = vllm_config.speculative_config ascend_config = get_ascend_config() @@ -153,12 +164,6 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.max_num_tokens = self.scheduler_config.max_num_batched_tokens self.max_num_reqs = self.scheduler_config.max_num_seqs - self.graph_block_tables = np.zeros( - (self.vllm_config.scheduler_config.max_num_seqs, - (self.model_config.max_model_len + self.block_size - 1) // - self.block_size), - dtype=np.int32) - # Model-related. self.num_attn_layers = self.model_config.get_num_layers_by_block_type( vllm_config.parallel_config, LayerBlockType.attention) @@ -212,8 +217,13 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): # Set up speculative decoding. 
self.use_spec_decode = False self.spec_attn_mask = None + self.actual_seq_q_lens = [] + self.spec_token_num = 0 + self.decode_token_per_req = 1 if self.speculative_config: self.use_spec_decode = True + self.spec_token_num = self.speculative_config.num_speculative_tokens + assert self.spec_token_num > 0 self.spec_attn_mask = torch.triu(torch.ones(2048, 2048, dtype=torch.bool), @@ -226,6 +236,12 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.device) # type: ignore elif self.speculative_config.method == 'deepseek_mtp': self.drafter = MtpProposer(self.vllm_config, self) + self.decode_token_per_req = 1 + self.spec_token_num + self.actual_seq_q_lens = [ + len for len in + range(self.decode_token_per_req, self.max_num_tokens + + 1, self.decode_token_per_req) + ] else: raise ValueError("Unknown speculative decoding method: " f"{self.speculative_config.method}") @@ -247,6 +263,12 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.seq_lens = torch.zeros(self.max_num_reqs, dtype=torch.int32, device=self.device) + self.slot_mapping = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device=self.device) + self.query_lens = torch.zeros(self.max_num_reqs, + dtype=torch.int32, + device=self.device) # None in the first PP rank. The rest are set after load_model. self.intermediate_tensors: Optional[IntermediateTensors] = None @@ -347,15 +369,19 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled and self.vllm_config.model_config.use_mla self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph self.torchair_graph_batch_sizes = ascend_config.torchair_graph_config.graph_batch_sizes + self.use_ring_mla = ascend_config.chunked_prefill_for_mla if ascend_config.torchair_graph_config.graph_batch_sizes_init: self.init_torchair_graph_batch_sizes() - if len(self.torchair_graph_batch_sizes) == 0: - # TODO(zzzzwwjj): check torchair_graph_batch_sizes init code - self.torchair_graph_batch_sizes = [ - self.scheduler_config.max_num_seqs - ] + self.check_torchair_graph_batch_sizes() + + # graph_block_tables shape: [num_request, cell(max_model_len / block_size)] + self.graph_block_tables = np.zeros( + (self.torchair_graph_batch_sizes[-1] // self.decode_token_per_req, + (self.model_config.max_model_len + self.block_size - 1) // + self.block_size), + dtype=np.int32) torch._dynamo.cache_size.config.cache_size_limit += len( self.torchair_graph_batch_sizes) @@ -373,6 +399,14 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.is_eplb_warmuped = False self.eplb_updator = EplbUpdator(ascend_config.expert_map_path) + # NOTE: we need to use `in_profile_run` to determine whether `enable_force_load_balance` is True + self.in_profile_run = False + + # kv role + self.is_kv_producer = False + if vllm_config.kv_transfer_config is not None: + self.is_kv_producer = vllm_config.kv_transfer_config.is_kv_producer + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler output. 
@@ -431,33 +465,19 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: generator.manual_seed(sampling_params.seed) else: generator = None - if vllm_version_is("0.9.1"): - self.requests[req_id] = CachedRequestState( - req_id=req_id, - prompt_token_ids=new_req_data.prompt_token_ids, - mm_inputs=new_req_data.mm_inputs, - mm_positions=new_req_data.mm_positions, - sampling_params=sampling_params, - generator=generator, - block_ids=new_req_data.block_ids, - num_computed_tokens=new_req_data.num_computed_tokens, - output_token_ids=[], - lora_request=new_req_data.lora_request, - ) - else: - self.requests[req_id] = CachedRequestState( - req_id=req_id, - prompt_token_ids=new_req_data.prompt_token_ids, - mm_inputs=new_req_data.mm_inputs, - mm_positions=new_req_data.mm_positions, - sampling_params=sampling_params, - pooling_params=None, - generator=generator, - block_ids=new_req_data.block_ids, - num_computed_tokens=new_req_data.num_computed_tokens, - output_token_ids=[], - lora_request=new_req_data.lora_request, - ) + + self.requests[req_id] = CachedRequestState( + req_id=req_id, + prompt_token_ids=new_req_data.prompt_token_ids, + mm_inputs=new_req_data.mm_inputs, + mm_positions=new_req_data.mm_positions, + sampling_params=sampling_params, + generator=generator, + block_ids=new_req_data.block_ids, + num_computed_tokens=new_req_data.num_computed_tokens, + output_token_ids=[], + lora_request=new_req_data.lora_request, + ) # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: @@ -580,6 +600,16 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Append to the end. req_index = None self.input_batch.add_request(req_state, req_index) + spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get( + req_id, ()) + if spec_token_ids: + req_index = self.input_batch.num_reqs - 1 + start_index = len(req_state.prompt_token_ids) + len( + req_state.output_token_ids) + end_token_index = start_index + len(spec_token_ids) + self.input_batch.token_ids_cpu[ + req_index, start_index:end_token_index] = spec_token_ids + self.input_batch.num_tokens[req_index] = end_token_index # Condense the batched states if there are empty indices. 
if removed_req_indices: @@ -589,16 +619,45 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: self.input_batch.refresh_sampling_metadata() def _get_forward_metadata_across_dp( - self, total_num_scheduled_tokens: int, - with_prefill: bool) -> tuple[int, bool]: + self, num_tokens: int, with_prefill: bool, enable_dbo: bool + ) -> tuple[int, Optional[torch.Tensor], bool, bool]: + if self.dp_size == 1: + return num_tokens, None, with_prefill, enable_dbo + forward_metadata = torch.tensor( - [total_num_scheduled_tokens, with_prefill], + [num_tokens, with_prefill, not enable_dbo], device="cpu", dtype=torch.int32) dist.all_reduce(forward_metadata, op=ReduceOp.MAX, group=get_dp_group().cpu_group) - return int(forward_metadata[0]), bool(forward_metadata[1] > 0) + num_tokens_across_dp = torch.tensor([forward_metadata[0]] * + self.dp_size, + device="cpu", + dtype=torch.int32) + return forward_metadata[0].item(), num_tokens_across_dp, bool( + forward_metadata[1]), not bool(forward_metadata[2]) + + def _check_dbo_is_valid(self, query_lens: torch.Tensor, + attn_state: AscendAttentionState, + num_tokens: int) -> bool: + # do the checks for dp + dbo + if attn_state in [ + AscendAttentionState.DecodeOnly, + AscendAttentionState.SpecDecoding + ]: + return False + # considering the case that one dp rank may enable dbo while others may not + if not self.vllm_config.model_config.use_mla or not envs_ascend.VLLM_ASCEND_ENABLE_DBO: + return False + # TODO: remove it if token-level microbatch is enabled + [token_index, + seq_index] = compute_split_seq_index(query_lens, attn_state, + num_tokens) + if token_index == 0 or seq_index == 0 or seq_index == len( + query_lens) or num_tokens < 256: + return False + return True def get_model(self) -> nn.Module: return self.model @@ -787,7 +846,8 @@ def _process_reqs( scheduler_output: "SchedulerOutput", intermediate_tensors: Optional[IntermediateTensors] = None, ) -> tuple[SpecDecodeMetadata, torch.Tensor, SpecDecodeMetadata, - torch.Tensor, int, torch.Tensor]: + torch.Tensor, int, torch.Tensor, Optional[set[str]], + Optional[set[str]]]: # Check input valid total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 @@ -855,6 +915,7 @@ def _process_reqs( self.mrope_positions_cpu[:, :total_num_scheduled_tokens], non_blocking=True) + self.positions[total_num_scheduled_tokens:num_input_tokens].zero_() self.positions[:total_num_scheduled_tokens].copy_( self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True) positions = self.positions[:num_input_tokens] @@ -883,6 +944,9 @@ def _process_reqs( # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache. elif np.all(num_scheduled_tokens == 1): attn_state = AscendAttentionState.DecodeOnly + if self.speculative_config and self.speculative_config.method == 'deepseek_mtp': + # SpecDecoding now supports seq_len=1 and seq_len=2 + attn_state = AscendAttentionState.SpecDecoding # Speculative decoding. elif np.all(num_valid_tokens == 1): attn_state = AscendAttentionState.SpecDecoding @@ -892,11 +956,14 @@ def _process_reqs( else: attn_state = AscendAttentionState.PrefillCacheHit - attn_mask = self._make_attention_mask(seq_lens=seq_lens, - query_lens=num_scheduled_tokens, - position=positions, - attn_state=attn_state) - self.attn_mask = attn_mask + # NOTE: when use ring_mla, attn_mask don't need to generate here. 
+ if not self.use_ring_mla or attn_state == AscendAttentionState.PrefillNoCache: + attn_mask = self._make_attention_mask( + seq_lens=seq_lens, + query_lens=num_scheduled_tokens, + position=positions, + attn_state=attn_state) + self.attn_mask = attn_mask self.attn_state = attn_state # type: ignore extra_builder_kwargs = {} @@ -907,36 +974,52 @@ def _process_reqs( self.query_start_loc_cpu[:num_reqs + 1], non_blocking=True) self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs], non_blocking=True) + self.slot_mapping[:total_num_scheduled_tokens].copy_( + self.slot_mapping_cpu[:total_num_scheduled_tokens], + non_blocking=True) # Fill unused with -1. Needed for reshape_and_cache + self.slot_mapping[total_num_scheduled_tokens:].fill_(-1) self.seq_lens[num_reqs:].fill_(0) self.query_start_loc[num_reqs + 1:].fill_(-1) query_start_loc = self.query_start_loc[:num_reqs + 1] - seq_lens = self.seq_lens[:num_reqs] + # Use host tensor, other wise error: tensor.hostData is null common_attn_metadata = CommonAttentionMetadata( - query_start_loc=query_start_loc, seq_lens=seq_lens) + query_start_loc=query_start_loc, + seq_lens=self.seq_lens_cpu[:num_reqs]) + self.seq_lens_list = self.seq_lens_np.tolist()[:num_input_tokens] with_prefill = attn_state not in [ AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding ] + enable_dbo = self._check_dbo_is_valid(self.query_lens.tolist(), + attn_state, + total_num_scheduled_tokens) + num_tokens_across_dp = None - if self.dp_size > 1: - max_num_tokens, with_prefill = self._get_forward_metadata_across_dp( - total_num_scheduled_tokens, with_prefill) - extra_builder_kwargs['max_num_tokens_across_dp'] = max_num_tokens - extra_builder_kwargs['with_prefill_across_dp'] = with_prefill - - # Add graph_pad_size here + padded_num_tokens = total_num_scheduled_tokens if self.torchair_graph_enabled and not with_prefill: - if self.dp_size > 1: - padded_batch_size = self.select_torchair_padded_batch_size( - max_num_tokens) - else: - padded_batch_size = self.select_torchair_padded_batch_size( - total_num_scheduled_tokens) - graph_pad_size = padded_batch_size - total_num_scheduled_tokens + padded_num_tokens = self.select_torchair_padded_batch_size( + total_num_scheduled_tokens) + (padded_num_tokens_across_dp, num_tokens_across_dp, with_prefill, + enable_dbo) = self._get_forward_metadata_across_dp( + padded_num_tokens, with_prefill, enable_dbo) + extra_builder_kwargs['enable_dbo_across_dp'] = enable_dbo + + # TODO(zzzzwwjj): this code need to refactor afterwards. 
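+        # Illustrative example of the torchair padding computed just below
+        # (hypothetical numbers): with padded_num_tokens_across_dp=16,
+        # decode_token_per_req=2 and num_reqs=5 (10 scheduled tokens),
+        # num_token_pad_size is 16 - 10 = 6 and num_reqs_pad_size is
+        # 16 // 2 - 5 = 3, so the captured graph always sees its fixed batch shape.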
+ self.with_prefill = with_prefill + # Add num_token_pad_size and num_reqs_pad_size here for torchair graph mode + if self.torchair_graph_enabled and not with_prefill: + num_token_pad_size = padded_num_tokens_across_dp - total_num_scheduled_tokens + num_reqs_pad_size = ( + padded_num_tokens_across_dp // self.decode_token_per_req - + num_reqs) + assert num_token_pad_size >= 0 and num_reqs_pad_size >= 0 - extra_builder_kwargs['graph_pad_size'] = graph_pad_size + extra_builder_kwargs['num_token_pad_size'] = num_token_pad_size + extra_builder_kwargs['num_reqs_pad_size'] = num_reqs_pad_size + self.num_reqs_pad_size = num_reqs_pad_size + self.extra_builder_kwargs = extra_builder_kwargs if self.vllm_config.model_config.use_mla: attn_metadata = self.attn_metadata_builder.build( # type: ignore @@ -952,6 +1035,7 @@ def _process_reqs( num_reqs=num_reqs, num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, + common_attn_metadata=common_attn_metadata, common_prefix_len=None, **extra_builder_kwargs, ) @@ -967,10 +1051,7 @@ def _process_reqs( # Copy the tensors to the NPU. self.input_ids[:total_num_scheduled_tokens].copy_( self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) - input_ids = self.input_ids[:num_input_tokens] - # prepare the MRoPE for mllm if using multimodal - num_input_tokens = total_num_scheduled_tokens # _prepare_inputs may reorder the batch, so we must gather multi # modal outputs after that to ensure the correct order if self.is_multimodal_model: @@ -984,51 +1065,56 @@ def _process_reqs( # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. - input_ids = self.input_ids[:num_input_tokens] + input_ids = self.input_ids[:total_num_scheduled_tokens] if mm_embeds: inputs_embeds = self.model.get_input_embeddings( input_ids, mm_embeds) else: inputs_embeds = self.model.get_input_embeddings(input_ids) # TODO(woosuk): Avoid the copy. Optimize. - self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds) + self.inputs_embeds[:total_num_scheduled_tokens].copy_( + inputs_embeds) inputs_embeds = self.inputs_embeds[:num_input_tokens] input_ids = None else: # For text-only models, we use token ids as input. # While it is possible to use embeddings as input just like the # multimodal models, it is not desirable for performance since - # then the embedding layer is not included in the CUDA graph. + # then the embedding layer is not included in the ACL Graph. input_ids = self.input_ids[:num_input_tokens] inputs_embeds = None if self.uses_mrope: positions = self.mrope_positions[:, :num_input_tokens] - else: - positions = self.positions[:num_input_tokens] if self.torchair_graph_enabled and not with_prefill: - input_ids = self.input_ids[:padded_batch_size] - positions = self.positions[:padded_batch_size] + input_ids = self.input_ids[:padded_num_tokens_across_dp] + positions = self.positions[:padded_num_tokens_across_dp] # Run forward pass - with set_forward_context(attn_metadata, - self.vllm_config, - num_tokens=num_input_tokens): + # TODO(zzzzwwjj): check param `num_tokens_across_dp` later. 
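+        # set_ascend_forward_context wraps vLLM's forward context with the
+        # Ascend-specific fields (e.g. the DP-padded token counts and the
+        # with_prefill flag) that downstream ops such as the quantized
+        # fused-MoE kernels read back via get_forward_context().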
+ with set_ascend_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=padded_num_tokens_across_dp, + num_tokens_across_dp=num_tokens_across_dp, + with_prefill=with_prefill): with ProfileExecuteDuration().capture_async("forward"): + self.maybe_setup_kv_connector(scheduler_output) model_kwargs = {} if self.torchair_graph_enabled: model_kwargs["kv_caches"] = self.kv_caches model_kwargs["attn_metadata"] = attn_metadata + if envs_ascend.VLLM_ASCEND_ENABLE_DBO and with_prefill: + model_kwargs["graph_enable"] = False # type: ignore if self.torchair_graph_enabled and not with_prefill: compiled_model = self._get_torchair_lazy_compiled_model( - padded_batch_size) + padded_num_tokens_across_dp) hidden_states = compiled_model( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, - **model_kwargs, - ) + **model_kwargs) else: assert self.model is not None hidden_states = self.model( @@ -1036,9 +1122,11 @@ def _process_reqs( positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, - **model_kwargs, - ) + **model_kwargs) + self.maybe_wait_for_kv_save() + finished_sending, finished_recving = self.get_finished_kv_transfer( + scheduler_output) use_spec_decode = len( scheduler_output.scheduled_spec_decode_tokens) > 0 if not use_spec_decode: @@ -1063,7 +1151,8 @@ def _process_reqs( sample_indices = spec_decode_metadata.logits_indices return (attn_metadata, hidden_states, spec_decode_metadata, positions, - total_num_scheduled_tokens, sample_indices) + total_num_scheduled_tokens, sample_indices, finished_sending, + finished_recving) def _calc_spec_decode_metadata( self, @@ -1230,21 +1319,25 @@ def execute_model( scheduler_output: "SchedulerOutput", intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[ModelRunnerOutput, torch.Tensor]: - with ProfileExecuteDuration().capture_async( "prepare input and forward"): self._update_states(scheduler_output) if not scheduler_output.total_num_scheduled_tokens: - # Return empty ModelRunnerOuptut if there's no work to do. - return EMPTY_MODEL_RUNNER_OUTPUT + if not has_kv_transfer_group(): + logger.debug( + "skip this step for we receive the data from remote disaggregate prefill node" + ) + # Return empty ModelRunnerOuptut if there's no work to do. 
+ return EMPTY_MODEL_RUNNER_OUTPUT + return self.kv_connector_no_forward(scheduler_output) if self.dynamic_eplb: self.eplb_updator.forward_before() (attn_metadata, hidden_states, spec_decode_metadata, positions, - num_scheduled_tokens, - sample_indices) = (self._process_reqs(scheduler_output, - intermediate_tensors)) + num_scheduled_tokens, sample_indices, finished_sending, + finished_recving) = (self._process_reqs(scheduler_output, + intermediate_tensors)) with ProfileExecuteDuration().capture_async("post process"): logits = self.model.compute_logits(hidden_states[sample_indices], @@ -1335,25 +1428,19 @@ def execute_model( hidden_states, attn_metadata, ) - if vllm_version_is("0.9.1"): - model_runner_output = ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=valid_sampled_token_ids, - spec_token_ids=spec_token_ids, - logprobs=logprobs_lists, - prompt_logprobs_dict={}, - ) - else: - model_runner_output = ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=valid_sampled_token_ids, - spec_token_ids=spec_token_ids, - logprobs=logprobs_lists, - prompt_logprobs_dict={}, - pooler_output=[], - ) + if has_kv_transfer_group(): + get_kv_transfer_group().clear_connector_metadata() + + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + spec_token_ids=spec_token_ids, + logprobs=logprobs_lists, + prompt_logprobs_dict={}, + finished_sending=finished_sending, + finished_recving=finished_recving, + ) durations = ProfileExecuteDuration().pop_captured_sync() if durations: @@ -1370,6 +1457,50 @@ def execute_model( return model_runner_output + def kv_connector_no_forward( + self, scheduler_output: "SchedulerOutput") -> ModelRunnerOutput: + # TODO(zzzzwwjj): Check whether `set_ascend_forward_context` has influence with kv_connector or not. + with set_ascend_forward_context(None, self.vllm_config): + self.maybe_setup_kv_connector(scheduler_output) + finsihed_sending, finished_recving = ( + self.get_finished_kv_transfer(scheduler_output)) + # For the case of no forward caused by receiving remote kv, + # one round of dummy inference is necessary + # to prevent hang over the collective calls. + if not finsihed_sending and not finished_recving: + return EMPTY_MODEL_RUNNER_OUTPUT + + output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) + output.finished_sending = finsihed_sending + output.finished_recving = finished_recving + return output + + @staticmethod + def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"): + # Update KVConnector with the KVConnector metadata forward(). 
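+        # i.e. bind the metadata carried in the scheduler output to the KV
+        # connector and kick off loading of remote KV blocks before the model
+        # forward pass runs.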
+ if has_kv_transfer_group(): + kv_connector = get_kv_transfer_group() + assert isinstance(kv_connector, KVConnectorBase_V1) + assert scheduler_output.kv_connector_metadata is not None + kv_connector.bind_connector_metadata( + scheduler_output.kv_connector_metadata) + + kv_connector.start_load_kv(get_forward_context()) + + @staticmethod + def maybe_wait_for_kv_save() -> None: + if has_kv_transfer_group(): + get_kv_transfer_group().wait_for_save() + + @staticmethod + def get_finished_kv_transfer( + scheduler_output: "SchedulerOutput", + ) -> tuple[Optional[set[str]], Optional[set[str]]]: + if has_kv_transfer_group(): + return get_kv_transfer_group().get_finished( + scheduler_output.finished_req_ids) + return None, None + def _profile_multimodal(self) -> None: # TODO: handle encoder-decoder models once we support them. # NOTE: Currently model is profiled with a single non-text @@ -1457,16 +1588,34 @@ def _profile_multimodal(self) -> None: def _dummy_run( self, num_tokens: int, - is_compile: bool = False, - with_prefill: bool = True, + skip_attn: bool = True, + with_prefill: bool = False, + is_torchair_compile: bool = False, + attn_state: AscendAttentionState = AscendAttentionState.DecodeOnly, is_profile_run: bool = False, ) -> torch.Tensor: + if self.torchair_graph_enabled and not with_prefill: + num_tokens = self.select_torchair_padded_batch_size(num_tokens) + + # For kv producer, with prefill always true + if self.is_kv_producer: + with_prefill = True + # Padding for DP + (num_tokens, num_tokens_across_dp, with_prefill, + enable_dbo) = self._get_forward_metadata_across_dp( + num_tokens, with_prefill, False) + # Set num_scheduled_tokens based on num_tokens and max_num_seqs # for dummy run with LoRA so that the num_reqs collectively # has num_tokens in total. assert num_tokens <= self.scheduler_config.max_num_batched_tokens max_num_reqs = self.scheduler_config.max_num_seqs - num_reqs = max_num_reqs if num_tokens >= max_num_reqs else num_tokens + num_reqs = math.ceil(num_tokens / self.decode_token_per_req) + if with_prefill: + num_reqs = min(num_tokens, max_num_reqs) + else: + num_reqs = (num_tokens + self.decode_token_per_req - + 1) // self.decode_token_per_req min_tokens_per_req = num_tokens // num_reqs num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs num_scheduled_tokens_list[-1] += num_tokens % num_reqs @@ -1475,7 +1624,23 @@ def _dummy_run( num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32) - if not is_compile and not is_profile_run and self.dynamic_eplb: + # NOTE: If torchair graph mode and not with_prefill, + # we can't skip_attn, it will cause graph recompile. 
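+        # In other words, the dummy run has to feed the compiled graph the
+        # same torchair attention metadata structure as a real decode step;
+        # skipping it (attn_metadata=None) would change the inputs seen at
+        # trace time and force a recompile of the captured graph.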
+ if self.torchair_graph_enabled and not with_prefill: + attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy( + num_reqs=num_reqs, num_actual_tokens=1) + elif skip_attn: + attn_metadata = None + else: + attn_metadata = self.attn_metadata_builder.build_dummy_metadata( + num_actual_tokens=num_tokens, + num_reqs=num_reqs, + num_scheduled_tokens=num_scheduled_tokens, + attn_state=attn_state, + ) + + + if not is_torchair_compile and not is_profile_run and self.dynamic_eplb: self.eplb_updator.forward_before() with self.maybe_dummy_run_with_lora(self.lora_config, @@ -1507,14 +1672,17 @@ def _dummy_run( for k, v in self.intermediate_tensors.items() }) - with set_forward_context(None, - self.vllm_config, - num_tokens=num_tokens): + with set_ascend_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp, + with_prefill=with_prefill, + in_profile_run=self.in_profile_run): + model_kwargs = {} if self.torchair_graph_enabled and not with_prefill: - attn_metadata = self.attn_metadata_builder.build_dummy( - num_reqs=num_tokens, num_actual_tokens=1) # Only mark static while compiling - if is_compile: + if is_torchair_compile: torch._dynamo.mark_static(input_ids) torch._dynamo.mark_static(positions) torch._dynamo.mark_static( @@ -1529,26 +1697,42 @@ def _dummy_run( torch._dynamo.mark_static(kv[1]) compiled_model = self._get_torchair_lazy_compiled_model( num_tokens) + model_kwargs["kv_caches"] = self.kv_caches + model_kwargs["attn_metadata"] = attn_metadata + if envs_ascend.VLLM_ASCEND_ENABLE_DBO: + model_kwargs["graph_enable"] = True # type: ignore hidden_states = compiled_model( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=None, - kv_caches=self.kv_caches, - attn_metadata=attn_metadata, + **model_kwargs, ) else: + if envs_ascend.VLLM_ASCEND_ENABLE_DBO: + model_kwargs["graph_enable"] = False # type: ignore hidden_states = model( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds) - - if is_profile_run and self.dynamic_eplb: - self.model.clear_all_moe_loads() - if not is_compile and not is_profile_run and self.dynamic_eplb: - self.eplb_updator.forward_end() - return hidden_states + inputs_embeds=inputs_embeds, + **model_kwargs) + if self.speculative_config and self.speculative_config.method == "deepseek_mtp": + assert isinstance(self.drafter, MtpProposer) + self.drafter.dummy_run(num_reqs, with_prefill=with_prefill) + if is_profile_run and self.dynamic_eplb: + self.model.clear_all_moe_loads() + if not is_torchair_compile and not is_profile_run and self.dynamic_eplb: + self.eplb_updator.forward_end() + return hidden_states + + @contextmanager + def set_in_profile_run(self): + self.in_profile_run = True + try: + yield + finally: + self.in_profile_run = False def profile_run(self) -> None: # FIXME Profile with multimodal encoder & encoder cache. @@ -1576,7 +1760,10 @@ def profile_run(self) -> None: # TODO: call maybe_profile_with_lora() # Trigger compilation for general shape. 
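+        # The dummy run below is wrapped in set_in_profile_run() so that the
+        # forward context can mark this as a profile run; per the NOTE in
+        # __init__, that flag is what enables enable_force_load_balance for
+        # the MoE layers during profiling.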
- hidden_states = self._dummy_run(num_tokens = self.max_num_tokens, is_profile_run=True) + with self.set_in_profile_run(): + hidden_states = self._dummy_run(self.max_num_tokens, + with_prefill=True, + is_profile_run=True) if get_pp_group().is_last_rank: hidden_states = hidden_states[logit_indices] @@ -1621,9 +1808,9 @@ def load_model(self) -> None: m.consumed_memory / float(2**30)) def _get_torchair_lazy_compiled_model(self, batch_size: int): - if batch_size < 0 or batch_size > self.max_num_reqs: + if batch_size < 0 or batch_size > self.torchair_graph_batch_sizes[-1]: raise ValueError( - f"Bad graph batch size:{batch_size}! max_num_reqs:{self.max_num_reqs}" + f"Bad graph batch size:{batch_size}! max_graph_batch_sizes:{self.torchair_graph_batch_sizes[-1]}" ) compiled_model = self.torchair_compiled_models.get( @@ -1633,9 +1820,6 @@ def _get_torchair_lazy_compiled_model(self, batch_size: int): if compiled_model: return compiled_model - import torchair # type: ignore - from torchair import patch_for_hcom # type: ignore - patch_for_hcom() config = torchair.CompilerConfig() config.experimental_config.frozen_parameter = True @@ -1683,9 +1867,14 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: kv_cache_config: Configuration for the KV cache, including the KV cache size of each layer """ - import torch_npu kv_caches: Dict[str, torch.Tensor] = {} + def align_memory(tensor: torch.Tensor, alignment: int) -> torch.Tensor: + data_ptr = tensor.data_ptr() + aligned_addr = (data_ptr + alignment - 1) // alignment * alignment + offset = (aligned_addr - data_ptr) // tensor.element_size() + return tensor[int(offset):] + self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, max_model_len=self.model_config.max_model_len, @@ -1718,6 +1907,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: # different GPUs, and `kv_cache_config.num_blocks` is set to # the min of all `num_blocks`. Verify it here. assert num_blocks >= kv_cache_config.num_blocks + alignment = 2 * 1024 * 1024 # TODO: remove this after the OOM issue is located and fixed, otherwise, some model may # encounter OOM issue if isinstance(kv_cache_spec, FullAttentionSpec): @@ -1725,29 +1915,51 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype - if self.torchair_graph_enabled: - layer_kv_cache_nope = torch.zeros( - kv_cache_shape[:-1] + - (self.model_config.hf_text_config.kv_lora_rank, ), - dtype=self.dtype, - pin_memory=True, - device=self.device) - layer_kv_cache_pe = torch.zeros( - kv_cache_shape[:-1] + - (self.model_config.hf_text_config.qk_rope_head_dim, - ), - dtype=self.dtype, - pin_memory=True, - device=self.device) - kv_caches[layer_name] = (layer_kv_cache_nope, - layer_kv_cache_pe) - torch_npu.npu_format_cast(kv_caches[layer_name][0], 2) - torch_npu.npu_format_cast(kv_caches[layer_name][1], 2) + if self.model_config.is_deepseek_mla: + # In order to transfer kv cache through the reigster_memory api from llmdatadist, the memory + # address should be aligned by 2M. In most case, torch_npu can allocate 2M aligned memory, but + # we found there are also some exceptions during test, so we manual align those memory here, this part + # of code may consume 2M * 2 * elem_size memory every layer. 
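+                        # Illustrative worst case: if torch_npu returns a buffer
+                        # starting just past a 2 MiB boundary, align_memory()
+                        # skips up to alignment / elem_size elements (about 1M
+                        # elements for 2-byte dtypes), which is why each buffer
+                        # below is over-allocated by `alignment` elements before
+                        # being sliced back to its logical shape.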
+ num_blocks, block_size, num_kv_heads, head_size = kv_cache_shape + rope_dim = self.model_config.hf_text_config.qk_rope_head_dim + nope_dim = head_size - rope_dim + nope_allocate_shape = num_blocks * block_size * num_kv_heads * nope_dim + nope_allocate_shape_alignment = nope_allocate_shape + alignment + rope_allocate_shape = num_blocks * block_size * num_kv_heads * rope_dim + rope_allocate_shape_alignment = rope_allocate_shape + alignment + nope_cache_shape = (num_blocks, block_size, + num_kv_heads, nope_dim) + rope_cache_shape = (num_blocks, block_size, + num_kv_heads, rope_dim) + nope_cache = torch.zeros(nope_allocate_shape_alignment, + dtype=dtype, + device=self.device) + rope_cache = torch.zeros(rope_allocate_shape_alignment, + dtype=dtype, + device=self.device) + nope_cache = align_memory( + nope_cache, alignment)[:nope_allocate_shape].view( + nope_cache_shape) + rope_cache = align_memory( + rope_cache, alignment)[:rope_allocate_shape].view( + rope_cache_shape) + kv_caches[layer_name] = (nope_cache, rope_cache) else: - kv_caches[layer_name] = torch.zeros(kv_cache_shape, - dtype=dtype, - device=self.device) - torch_npu.npu_format_cast(kv_caches[layer_name], 2) + num_caches = kv_cache_shape[0] + kv_cache_list = [] + for i in range(num_caches): + cache_shape = kv_cache_shape[1:] + cache_size = math.prod(cache_shape) + cache_size_aligned = cache_size + alignment + kv_cache = torch.zeros(cache_size_aligned, + dtype=dtype, + device=self.device) + kv_cache = align_memory( + kv_cache, + alignment)[:cache_size].view(cache_shape) + kv_cache_list.append(kv_cache) + kv_caches[layer_name] = kv_cache_list + # torch_npu.npu_format_cast(kv_caches[layer_name], 2) else: # TODO: add new branches when introducing more types of # KV cache specs. @@ -1758,6 +1970,9 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: self.vllm_config.compilation_config.static_forward_context, self.kv_caches) + if has_kv_transfer_group(): + get_kv_transfer_group().register_kv_caches(kv_caches) + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ Generates the KVCacheSpec by parsing the kv cache format from each @@ -1816,24 +2031,25 @@ def capture_model(self) -> None: reversed(torchair_graph_batch_sizes)): for _ in range(self.vllm_config.compilation_config. cudagraph_num_of_warmups): - self._dummy_run(num_tokens, - is_compile=True, - with_prefill=False) - self._dummy_run(num_tokens, - is_compile=True, - with_prefill=False) + # NOTE: when in torchair graph and not with_prefill, + # we don't need to set `skip_attn=False` + self._dummy_run(num_tokens, is_torchair_compile=True) + self._dummy_run(num_tokens, is_torchair_compile=True) logger.info("Batchsize %d is compiled successfully: %d/%d.", num_tokens, idx + 1, graph_num) elif self.use_aclgraph: # Trigger ACL graph capture for specific shapes. # Capture the large shapes first so that the smaller shapes # can reuse the memory pool allocated for the large shapes. + # TODO(zzzzwwjj): Check dummy_run with ACL Graph and full graph mode with graph_capture(device=self.device): + skip_attn = not self.vllm_config.compilation_config.full_cuda_graph + # TODO: Make sure passing attn_state to _dummy_run in the future for num_tokens in reversed(self.aclgraph_batch_sizes): for _ in range(self.vllm_config.compilation_config. 
cudagraph_num_of_warmups): - self._dummy_run(num_tokens) - self._dummy_run(num_tokens) + self._dummy_run(num_tokens, skip_attn=skip_attn) + self._dummy_run(num_tokens, skip_attn=skip_attn) else: logger.info("Skipping NPU graph capture for eager mode.") return @@ -1928,6 +2144,7 @@ def _generate_mtp_token_ids( cu_num_tokens, token_indices = self.drafter.prepare_inputs( attn_metadata.query_start_loc, num_rejected_tokens, + force_one_token=True, ) target_token_ids = self.input_ids[token_indices] target_positions = positions[token_indices] @@ -1959,8 +2176,43 @@ def init_torchair_graph_batch_sizes(self): start_graph_batch_size *= 2 def select_torchair_padded_batch_size(self, batch_size: int): - selected_batch_size = self.max_num_reqs for padded_batch_size in self.torchair_graph_batch_sizes: - if batch_size <= padded_batch_size < selected_batch_size: - selected_batch_size = padded_batch_size - return selected_batch_size + if batch_size <= padded_batch_size: + # we treat batch_size as num of requests + return padded_batch_size + raise ValueError( + f"cur batch_size is invalid, torchair_graph_batch_sizes is " + f"{self.torchair_graph_batch_sizes}, but cur batch_size is {batch_size}." + ) + + def check_torchair_graph_batch_sizes(self): + # return graph_batch_sizes according to the number of tokens + # first pad according to the number of requests + if len(self.torchair_graph_batch_sizes) == 0: + self.torchair_graph_batch_sizes = [1, self.max_num_reqs] + else: + self.torchair_graph_batch_sizes = sorted( + self.torchair_graph_batch_sizes) + while self.torchair_graph_batch_sizes[-1] > self.max_num_reqs: + self.torchair_graph_batch_sizes.pop() + if len(self.torchair_graph_batch_sizes) == 0: + logger.warning( + "torch_graph_batch_sizes is invalid, reset it to [1, max_num_seqs]" + ) + self.torchair_graph_batch_sizes = [1, self.max_num_reqs] + if self.torchair_graph_batch_sizes[-1] < self.max_num_reqs: + self.torchair_graph_batch_sizes.append(self.max_num_reqs) + + # NOTE: when enable_expert_parallel, we need to check if `graph_batch_size` is divisible by `tp_size` + tp_size = self.parallel_config.tensor_parallel_size + if self.parallel_config.enable_expert_parallel: + new_graph_batch_sizes = [] + for graph_batch_size in self.torchair_graph_batch_sizes: + cur_graph_batch_size = (graph_batch_size + tp_size - + 1) // tp_size * tp_size + # `graph_batch_size` need to be divisible by `self.decode_token_per_req` + cur_graph_batch_size = cur_graph_batch_size * self.decode_token_per_req + if cur_graph_batch_size not in new_graph_batch_sizes and \ + cur_graph_batch_size <= self.scheduler_config.max_num_batched_tokens: + new_graph_batch_sizes.append(cur_graph_batch_size) + self.torchair_graph_batch_sizes = new_graph_batch_sizes diff --git a/vllm_ascend/worker/mtp_proposer_v1.py b/vllm_ascend/worker/mtp_proposer_v1.py index ba8406fa0a..04a7d617b5 100644 --- a/vllm_ascend/worker/mtp_proposer_v1.py +++ b/vllm_ascend/worker/mtp_proposer_v1.py @@ -1,15 +1,19 @@ import torch +import vllm.envs as envs_vllm from vllm.attention.layer import Attention from vllm.config import (VllmConfig, get_layers_from_vllm_config, set_current_vllm_config) -from vllm.forward_context import set_forward_context from vllm.model_executor.model_loader import get_model_loader from vllm.model_executor.model_loader.utils import ( process_weights_after_loading, set_default_torch_dtype) from vllm.v1.sample.metadata import SamplingMetadata -from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata +from vllm_ascend.ascend_forward_context 
import set_ascend_forward_context +from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.attention.utils import \ + AscendCommonAttentionMetadata as CommonAttentionMetadata from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP +from vllm_ascend.utils import ProfileExecuteDuration # FIXME(woosuk): The logic here is duplicated with the main sampling code. @@ -61,13 +65,26 @@ def __init__( vllm_config.speculative_config.num_speculative_tokens) self.block_size = vllm_config.cache_config.block_size self.runner = runner + # persistent buffers for graph + self.input_ids = torch.zeros(self.runner.max_num_tokens, + dtype=torch.int32, + device=self.runner.device) + self.positions = torch.zeros(self.runner.max_num_tokens, + dtype=torch.int64, + device=self.runner.device) + self.hidden_states = torch.zeros( + (self.runner.max_num_tokens, self.runner.hidden_size), + dtype=self.runner.dtype, + device=self.runner.device) + self.is_mtp_torchair_ready = False @staticmethod def prepare_inputs( - # [batch_size + 1] - cu_target_query_lens: torch.Tensor, - # [batch_size] - num_rejected_tokens: torch.Tensor, + # [batch_size + 1] + cu_target_query_lens: torch.Tensor, + # [batch_size] + num_rejected_tokens: torch.Tensor, + force_one_token: bool = False ) -> tuple[torch.Tensor, torch.Tensor]: # cu_target_query_lens: [0, a, a + b, a + b + c] # num_rejected_tokens: [n1, n2, n3] @@ -76,32 +93,39 @@ def prepare_inputs( # token_indices: [0, 1, ..., a - n1 - 1, # a, a + 1, ..., a + b - n2 - 1, # a + b, a + b + 1, ..., a + b + c - n3 - 1] - # [0, a, a + b, a + b + c] -> [a, b, c] query_len_per_req = (cu_target_query_lens[1:] - cu_target_query_lens[:-1]) # [a, b, c] -> [a - n1, b - n2, c - n3] num_tokens_per_req = query_len_per_req - num_rejected_tokens + if force_one_token: + # enable force_one_token means we only focus on the last token position of each request + # token_indices: [batch_size] + cu_num_tokens = torch.arange(cu_target_query_lens.size(0), + device=cu_target_query_lens.device, + dtype=torch.int32) + relative_index = query_len_per_req - num_rejected_tokens - 1 + token_indices = cu_target_query_lens[:-1] + relative_index + else: + cu_num_tokens = torch.empty_like(cu_target_query_lens) + torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:]) + cu_num_tokens[0] = 0 + + # FIXME(woosuk): Avoid synchronization. + num_tokens = cu_num_tokens[-1].item() + token_indices = torch.empty( + num_tokens, + dtype=torch.int32, + device=cu_num_tokens.device, + ) - cu_num_tokens = torch.empty_like(cu_target_query_lens) - torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:]) - cu_num_tokens[0] = 0 - - # FIXME(woosuk): Avoid synchronization. - num_tokens = cu_num_tokens[-1].item() - token_indices = torch.empty( - num_tokens, - dtype=torch.int32, - device=cu_num_tokens.device, - ) - - BLOCK_SIZE = 1024 - prepare_input_kernel( - token_indices, - cu_target_query_lens, - cu_num_tokens, - block_size=BLOCK_SIZE, - ) + BLOCK_SIZE = 1024 + prepare_input_kernel( + token_indices, + cu_target_query_lens, + cu_num_tokens, + block_size=BLOCK_SIZE, + ) return cu_num_tokens, token_indices def propose( @@ -126,13 +150,12 @@ def propose( batch_size = next_token_ids.shape[0] last_token_indices = cu_num_tokens[1:] - 1 - input_ids = torch.empty_like(target_token_ids) # Shift the input ids by one token. 
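+        # The draft model conditions on the token at position i to predict
+        # position i + 1, hence the inputs are the target tokens shifted left.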
# E.g., [a1, b1, b2, c1, c2, c3] -> [b1, b2, c1, c2, c3, c3] - input_ids[:-1] = target_token_ids[1:] + self.input_ids[:num_tokens - 1] = target_token_ids[1:] # Replace the last token with the next token. # E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4] - input_ids[last_token_indices] = next_token_ids + self.input_ids[last_token_indices] = next_token_ids query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1] max_query_len = query_lens.max().item() @@ -152,6 +175,23 @@ def propose( # input_batch=self.runner.input_batch, # scheduler_output=self.runner.scheduler_output, # ) + extra_builder_kwargs = self.runner.extra_builder_kwargs + + is_running_torchair = self.runner.torchair_graph_enabled and \ + not self.runner.with_prefill and self.is_mtp_torchair_ready + + if is_running_torchair: + if num_tokens == 1: + self.runner.attn_state = AscendAttentionState.DecodeOnly + num_reqs_pad_size = self.runner.num_reqs_pad_size + extra_builder_kwargs['num_reqs_pad_size'] = num_reqs_pad_size + # Assume num token per request is one + extra_builder_kwargs['num_token_pad_size'] = num_reqs_pad_size + num_input_tokens = self.runner.num_reqs_pad_size + else: + extra_builder_kwargs['num_token_pad_size'] = -1 + extra_builder_kwargs['num_reqs_pad_size'] = 0 + num_input_tokens = num_tokens attn_metadata = self.runner.attn_metadata_builder.build( num_reqs=batch_size, @@ -159,14 +199,52 @@ def propose( max_query_len=max_query_len, common_prefix_len=0, common_attn_metadata=common_attn_metadata, - ) - - with set_forward_context(attn_metadata, self.vllm_config): - hidden_states = self.model( - input_ids=input_ids, - positions=target_positions, - previous_hidden_states=target_hidden_states, - ) + is_mtp_model=True, + **extra_builder_kwargs) + + self.positions[:num_tokens] = target_positions + self.hidden_states[:num_tokens] = target_hidden_states + + # Assuming force_one_token is on, so each perfill request query_lens is 1 + if attn_metadata.prefill is not None: + attn_metadata.prefill.query_lens[:] = 1 + + with set_ascend_forward_context(attn_metadata, + self.vllm_config, + num_tokens=num_input_tokens): + with ProfileExecuteDuration().capture_async('mtp_forward'): + model_kwargs = {} + model_kwargs["attn_metadata"] = attn_metadata + if self.runner.torchair_graph_enabled: + model_kwargs["kv_caches"] = self.runner.kv_caches[-1:] + if is_running_torchair: + torch._dynamo.mark_static(self.input_ids) + torch._dynamo.mark_static(self.positions) + torch._dynamo.mark_static(attn_metadata.decode.block_table) + torch._dynamo.mark_static( + attn_metadata.decode.input_positions) + torch._dynamo.mark_static(attn_metadata.slot_mapping) + torch._dynamo.mark_static(attn_metadata.decode.attn_mask) + for kv in self.runner.kv_caches: + assert isinstance(kv, + tuple), "kv_cache must be a tuple" + torch._dynamo.mark_static(kv[0]) + torch._dynamo.mark_static(kv[1]) + hidden_states = self.torchair_compiled_model( + input_ids=self.input_ids[:num_input_tokens], + positions=self.positions[:num_input_tokens], + previous_hidden_states=self. + hidden_states[:num_input_tokens], + inputs_embeds=None, + **model_kwargs) + else: + hidden_states = self.model( + input_ids=self.input_ids[:num_input_tokens], + positions=self.positions[:num_input_tokens], + previous_hidden_states=self. 
+ hidden_states[:num_input_tokens], + attn_metadata=attn_metadata, + kv_caches=self.runner.kv_caches[-1:]) sample_hidden_states = hidden_states[last_token_indices] logits = self.model.compute_logits(sample_hidden_states, None) draft_token_ids = logits.argmax(dim=-1) @@ -202,6 +280,49 @@ def load_model(self) -> None: self.model)) process_weights_after_loading(self.model, draft_model_config, target_device) + if self.runner.torchair_graph_enabled and self.is_mtp_torchair_ready: + import torchair # type: ignore + from torchair import patch_for_hcom # type: ignore + + patch_for_hcom() + config = torchair.CompilerConfig() + config.experimental_config.frozen_parameter = True + config.experimental_config.tiling_schedule_optimize = True + torch.npu.set_compile_mode(jit_compile=False) + if not self.runner.use_cached_npu_graph: + npu_backend = torchair.get_npu_backend(compiler_config=config) + self.torchair_compiled_model = torch.compile( + self.model, + dynamic=True, + fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + backend=npu_backend) + else: + self.torchair_compiled_model = torchair.inference.cache_compile( + self.model.forward, + dynamic=True, + fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + config=config, + ge_cache=False) + + @torch.inference_mode() + def dummy_run( + self, + num_tokens: int, + with_prefill: bool = False, + ) -> None: + if self.runner.torchair_graph_enabled and not with_prefill: + attn_metadata = self.runner.attn_metadata_builder.build_torchair_graph_dummy( + num_reqs=num_tokens, num_actual_tokens=1, is_mtp_model=True) + else: + attn_metadata = self.runner.attn_metadata_builder.build_torchair_graph_dummy( + num_reqs=num_tokens, num_actual_tokens=1, is_mtp_model=True) + with set_ascend_forward_context(None, + self.vllm_config, + num_tokens=num_tokens): + self.model(input_ids=self.input_ids[:num_tokens], + positions=self.positions[:num_tokens], + previous_hidden_states=self.hidden_states[:num_tokens], + attn_metadata=attn_metadata) # TODO Using torch instead of triton may result in poor performance diff --git a/vllm_ascend/worker/pooling_model_runner.py b/vllm_ascend/worker/pooling_model_runner.py index e1262fb0a2..5047a0f106 100644 --- a/vllm_ascend/worker/pooling_model_runner.py +++ b/vllm_ascend/worker/pooling_model_runner.py @@ -21,13 +21,13 @@ import torch from vllm.distributed import get_pp_group -from vllm.forward_context import set_forward_context from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.multimodal import MultiModalKwargs from vllm.pooling_params import PoolingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) +from vllm_ascend.ascend_forward_context import set_ascend_forward_context from vllm_ascend.worker.model_runner import (ModelInputForNPU, ModelInputForNPUBuilder, NPUModelRunnerBase) @@ -142,8 +142,8 @@ def execute_model( if model_input.token_types is not None: cross_enc_kwargs["token_type_ids"] = model_input.token_types - with set_forward_context(model_input.attn_metadata, self.vllm_config, - virtual_engine): + with set_ascend_forward_context(model_input.attn_metadata, + self.vllm_config, virtual_engine): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index e78cc3f1cf..80f7c4a78d 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -49,9 +49,8 @@ from vllm_ascend.ascend_config import 
init_ascend_config from vllm_ascend.device_allocator.camem import CaMemAllocator -from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import try_register_lib +from vllm_ascend.utils import init_ascend_soc_version, try_register_lib from vllm_ascend.worker.model_runner import NPUModelRunner from vllm_ascend.worker.pooling_model_runner import NPUPoolingModelRunner @@ -218,6 +217,7 @@ def init_device(self) -> None: else: raise RuntimeError( f"Not support device type: {self.device_config.device}") + init_ascend_soc_version() # Initialize the distributed environment. self._init_worker_distributed_environment(self.vllm_config, self.rank, self.distributed_init_method, @@ -545,11 +545,6 @@ def _init_worker_distributed_environment( ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) - init_ascend_model_parallel( - parallel_config.expert_parallel_size, - parallel_config.expert_tensor_parallel_size, - parallel_config.world_size_across_dp, - ) ensure_kv_transfer_initialized(vllm_config) diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 48675ed341..f9ad57e0b5 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -40,9 +40,8 @@ from vllm_ascend.ascend_config import init_ascend_config from vllm_ascend.device_allocator.camem import CaMemAllocator -from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import try_register_lib +from vllm_ascend.utils import init_ascend_soc_version, try_register_lib from vllm_ascend.worker.model_runner_v1 import NPUModelRunner @@ -75,6 +74,9 @@ def __init__( is_driver_worker=is_driver_worker) # Try to import mindie_turbo to accelerate vLLM inference. + local_dp_rank = self.vllm_config.parallel_config.data_parallel_rank_local + world_size = self.vllm_config.parallel_config.world_size + self.local_rank_across_dp = local_dp_rank * world_size + self.local_rank try_register_lib( "mindie_turbo", "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo." @@ -125,6 +127,7 @@ def init_device(self): info = f"Not support device type: {self.device_config.device}" logger.error(info) raise RuntimeError(info) + init_ascend_soc_version() # Initialize the distributed environment. self._init_worker_distributed_environment() # Set random seed. 
@@ -202,7 +205,6 @@ def compile_or_warm_up_model(self) -> None: for size in sorted(warmup_sizes, reverse=True): logger.info("Compile and warming up model for size %d", size) self.model_runner._dummy_run(size) - if not self.model_config.enforce_eager: self.model_runner.capture_model() # Reset the seed to ensure that the random state is not affected by @@ -252,22 +254,10 @@ def pin_lora(self, lora_id: int) -> bool: return self.model_runner.pin_lora(lora_id) def execute_dummy_batch(self) -> None: - runner = self.model_runner - max_num_tokens = 1 - with_prefill = False - if runner.dp_size > 1: - max_num_tokens, with_prefill = runner._get_forward_metadata_across_dp( - max_num_tokens, with_prefill) - if runner.torchair_graph_enabled and not with_prefill: - max_num_tokens = runner.select_torchair_padded_batch_size( - max_num_tokens) - runner._dummy_run(max_num_tokens, - is_compile=False, - with_prefill=with_prefill) + self.model_runner._dummy_run(1) def _init_worker_distributed_environment(self) -> None: """Initialize the distributed environment.""" - parallel_config = self.vllm_config.parallel_config set_custom_all_reduce( not self.parallel_config.disable_custom_all_reduce) init_distributed_environment(self.parallel_config.world_size, @@ -276,11 +266,6 @@ def _init_worker_distributed_environment(self) -> None: ensure_model_parallel_initialized( self.parallel_config.tensor_parallel_size, self.parallel_config.pipeline_parallel_size) - init_ascend_model_parallel( - parallel_config.expert_parallel_size, - parallel_config.expert_tensor_parallel_size, - parallel_config.world_size_across_dp, - ) ensure_kv_transfer_initialized(self.vllm_config) def _init_profiler(self):