diff --git a/.github/workflows/accuracy_report.yaml b/.github/workflows/accuracy_report.yaml index 1d03e1318..fe1dbd103 100644 --- a/.github/workflows/accuracy_report.yaml +++ b/.github/workflows/accuracy_report.yaml @@ -19,110 +19,184 @@ name: Accuracy Report on: workflow_dispatch: inputs: - branch: - description: 'choose a dev branch to pr' + vllm-ascend-branch: + description: 'vllm-ascend branch:' required: true - vllm-ascend-version: - description: 'what vllm-ascend version to accuracy test?' + type: choice + options: + - main + - v0.7.3-dev + models: + description: 'models:' required: true - type: string + type: choice + options: + - all + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen3-8B-Base + default: 'all' + jobs: - download: + download_reports: runs-on: ubuntu-latest + strategy: + matrix: + model: ${{ fromJSON( + (github.event.inputs.models == 'all' && + '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || + (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && + '["Qwen/Qwen2.5-7B-Instruct"]') || + (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && + '["Qwen/Qwen2.5-VL-7B-Instruct"]') || + (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' && + '["Qwen/Qwen3-8B-Base"]') + ) }} + + version: [0, 1] + exclude: + - model: 'Qwen/Qwen2.5-VL-7B-Instruct' + version: 1 + fail-fast: false + + name: Download ${{ matrix.model }} V${{ matrix.version }} steps: - name: Checkout repository uses: actions/checkout@v4 with: - ref: ${{ github.event.inputs.branch }} - - - name: Debug List Artifacts - run: gh api /repos/${{ github.repository }}/actions/artifacts - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ref: ${{ github.event.inputs.vllm-ascend-branch }} - - name: Query artifact run id for Qwen2.5-VL-7B-Instruct V0 latest artifact - id: get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0 + - name: Get base model name + id: get_basename run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") - RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-VL-7B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + model_base_name=$(basename "${{ matrix.model }}") + echo "model_base_name=$model_base_name" >> $GITHUB_OUTPUT + shell: bash - - name: Query artifact run id for Qwen2.5-7B-Instruct V0 latest artifact - id: get_Qwen2_5_7B_Instruct_latest_run_id_V0 + - name: Query artifact run id + id: get_run_id run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") + ARTIFACT_PATTERN="${{ github.event.inputs.vllm-ascend-branch }}-${{ steps.get_basename.outputs.model_base_name }}-V${{ matrix.version }}-report" + echo "Querying artifacts with pattern: $ARTIFACT_PATTERN" + + ARTIFACT_JSON=$(gh api --paginate /repos/${{ github.repository }}/actions/artifacts || echo "{}") + RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-7B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" + jq -s -r --arg pattern "$ARTIFACT_PATTERN" \ + '[.[].artifacts[]] | map(select(.name | test($pattern))) | sort_by(.created_at) | last | .workflow_run.id // empty') + + if [ -z "$RUN_ID" ]; then + echo "::warning::No artifact found matching pattern $ARTIFACT_PATTERN. Skipping download." 
+ echo "runid=" >> $GITHUB_OUTPUT + else + echo "Found matching artifact with run ID: $RUN_ID" + echo "runid=$RUN_ID" >> $GITHUB_OUTPUT + fi env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Query artifact run id for Qwen3-8B-Base V0 latest artifact - id: get_Qwen3_8B_Base_latest_run_id_V0 - run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") - RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen3-8B-Base-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Download Qwen/Qwen2.5-VL-7B-Instruct V0 Artifact + - name: Download Artifact + if: ${{ steps.get_run_id.outputs.runid != '' }} uses: actions/download-artifact@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-VL-7B-Instruct-V0-report - path: ./docs/source/developer_guide/evaluation/accuracy_report - github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }} + name: ${{ github.event.inputs.vllm-ascend-branch }}-${{ steps.get_basename.outputs.model_base_name }}-V${{ matrix.version }}-report + path: ./docs/source/developer_guide/evaluation/accuracy_report_bak + github-token: ${{ secrets.GITHUB_TOKEN }} + repository: ${{ github.repository }} + run-id: ${{ steps.get_run_id.outputs.runid }} + + - name: Upload reports artifact + if: ${{ steps.get_run_id.outputs.runid != '' }} + uses: actions/upload-artifact@v4 + with: + name: report-${{ steps.get_basename.outputs.model_base_name }}-v${{ matrix.version }} + path: ./docs/source/developer_guide/evaluation/accuracy_report_bak/*.md + retention-days: 90 - - name: Download Qwen/Qwen2.5-7B-Instruct Artifact - uses: actions/download-artifact@v4 + create_pr: + runs-on: ubuntu-latest + needs: download_reports + steps: + - name: Checkout repository + uses: actions/checkout@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-7B-Instruct-V0-report - path: ./docs/source/developer_guide/evaluation/accuracy_report - github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }} + ref: ${{ github.event.inputs.vllm-ascend-branch }} + + - name: Setup workspace + run: mkdir -p ./accuracy/accuracy_report - - name: Download Qwen/Qwen3-8B-Base Artifact + - name: Download only current run reports uses: actions/download-artifact@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen3-8B-Base-V0-report path: ./docs/source/developer_guide/evaluation/accuracy_report + pattern: report-* github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }} + run-id: ${{ github.run_id }} + + - name: Delete old report + run: | + find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! 
-name 'index.md' -delete + find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \; + find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete - - name: Display Files - working-directory: ./docs/source/developer_guide/evaluation/accuracy_report + - name: Generate step summary + if: ${{ always() }} run: | - cat ./Qwen2.5-VL-7B-Instruct.md - cat ./Qwen2.5-7B-Instruct.md - cat ./Qwen3-8B-Base.md - - - name: Create Pull Request for markdown update + for report in ./docs/source/developer_guide/evaluation/accuracy_report/*.md; do + filename=$(basename "$report") + # skip index.md + if [ "$filename" = "index.md" ]; then + continue + fi + + if [ -f "$report" ]; then + { + echo -e "\n\n---\n" + echo "## 📄 Report File: $(basename $report)" + cat "$report" + } >> "$GITHUB_STEP_SUMMARY" + fi + done + + - name: Update accuracy_report/index.md + run: | + REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report" + INDEX_MD="$REPORT_DIR/index.md" + + { + echo "# Accuracy Report" + echo "" + echo "::: {toctree}" + echo ":caption: Accuracy Report" + echo ":maxdepth: 1" + + for report in "$REPORT_DIR"/*.md; do + filename="$(basename "$report" .md)" + if [ "$filename" != "index" ]; then + echo "$filename" + fi + done + + echo ":::" + } > "$INDEX_MD" + + - name: Create Pull Request uses: peter-evans/create-pull-request@v7 with: token: ${{ secrets.PR_TOKEN }} - base: ${{ github.event.inputs.branch }} - branch: auto-pr/accuracy-test - commit-message: "Update accuracy report for ${{ github.event.inputs.branch }}" + base: ${{ github.event.inputs.vllm-ascend-branch }} + branch: auto-pr/accuracy-report + commit-message: "Update accuracy reports for ${{ github.event.inputs.vllm-ascend-branch }}" add-paths: ./docs/source/developer_guide/evaluation/accuracy_report/*.md - title: "[Doc]Update accuracy report for ${{ github.event.inputs.branch }}" + title: "[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-branch }}" body: | - The accuracy results running on Ascend NPU have changed, I'm updating the report. - Please review the changes. 
- + The accuracy results running on NPU Altlas A2 have changed, updating reports for: + ${{ + github.event.inputs.models == 'all' + && 'All models (Qwen2.5-7B-Instruct, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)' + || github.event.inputs.models + }} + - [Workflow run][1] - - [Qwen2.5-7B-Instruct accuracy report][2] - - [Qwen2.5-VL-7B-Instruct accuracy report][3] - - [Qwen3-8B-Base accuracy report][4] - - [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - [2]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }} - [3]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }} - [4]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }} + + [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} \ No newline at end of file diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index d0a7debaa..999fb6ad5 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -34,8 +34,7 @@ on: # Current supported vLLM versions options: - main - - v0.9.0.1 - - v0.9.0 + - v0.9.1 - v0.7.3 vllm-ascend-version: description: 'vllm-ascend version:' @@ -96,7 +95,7 @@ jobs: # - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct model_name: ${{ fromJSON( (github.event.inputs.models == 'all' && - '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","model_name":"Qwen/Qwen3-8B-Base"]') || + '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && '["Qwen/Qwen2.5-7B-Instruct"]') || (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && @@ -159,7 +158,7 @@ jobs: repository: vllm-project/vllm path: ./vllm-empty # Please also update this when bump matched version - ref: ${{ github.event.inputs.vllm-version || 'v0.9.0' }} + ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }} - name: Install vllm-project/vllm from source working-directory: ./vllm-empty @@ -201,6 +200,7 @@ jobs: pip show torch | grep "Version:" | awk '{print "GHA_TORCH_VERSION="$2}' pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}' pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//' + echo "GHA_VLLM_ASCEND_VERSION=${{ github.event.inputs.vllm-ascend-version || github.ref }}" } >> "$GITHUB_ENV" - name: Print versions @@ -209,7 +209,7 @@ jobs: echo "Torch NPU: ${{ env.GHA_TORCH_NPU_VERSION }}" echo "Torch: ${{ env.GHA_TORCH_VERSION }}" echo "vLLM: ${{ env.GHA_VLLM_VERSION }}" - echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" + echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION }}" - name: Run Accuracy Test for V${{ matrix.vllm_use_version }} id: report @@ -238,10 +238,16 @@ jobs: run: | cat ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md >> $GITHUB_STEP_SUMMARY + - name: Sanitize version string for artifact naming + run: | + SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}" + echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV" + - name: Upload Report for V${{ matrix.vllm_use_version }} + if: ${{ github.event_name == 'workflow_dispatch' }} uses: actions/upload-artifact@v4 with: - name: "${{ env.GHA_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name 
}}-report" + name: "${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}-report" path: ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md if-no-files-found: warn retention-days: 90 diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml deleted file mode 100644 index 91cd9c412..000000000 --- a/.github/workflows/actionlint.yml +++ /dev/null @@ -1,53 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: Lint GitHub Actions workflows -on: - pull_request: - branches: - - 'main' - - '*-dev' - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run actionlint" - env: - SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086 - run: | - echo "::add-matcher::.github/workflows/matchers/actionlint.json" - tools/actionlint.sh -color diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml deleted file mode 100644 index a239d38fa..000000000 --- a/.github/workflows/codespell.yml +++ /dev/null @@ -1,47 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Adapted from vllm-project/vllm/blob/main/.github -# - -name: codespell - -on: - pull_request: - branches: - - 'main' - - '*-dev' - -jobs: - codespell: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Run codespell check - run: | - CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**') - CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn') - - codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}" diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml deleted file mode 100644 index 81cd69afe..000000000 --- a/.github/workflows/mypy.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: mypy - -on: - pull_request: - branches: - - 'main' - - '*-dev' - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. - paths: - - '**/*.py' - - '.github/workflows/mypy.yaml' - - 'tools/mypy.sh' - - 'mypy.ini' - -jobs: - mypy: - runs-on: ubuntu-latest - strategy: - matrix: - # TODO(yikun): Add 3.12 back when torch-npu support 3.12 - python-version: ["3.9", "3.10", "3.11"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu - - - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v4 - with: - repository: vllm-project/vllm - path: vllm-empty - - - name: Install vllm-project/vllm from source - working-directory: vllm-empty - run: | - pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu - VLLM_TARGET_DEVICE=empty pip install . 
- - - name: Mypy - run: | - echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 67f8f9096..116912111 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -41,13 +41,18 @@ jobs: test: if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} - name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }} + name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}, use_v1=${{ matrix.vllm_use_v1 }} runs-on: 'linux-arm64-npu-static-8' strategy: matrix: include: + - vllm_branch: v0.9.1 + vllm_ascend_branch: main + vllm_use_v1: 0 - vllm_branch: v0.9.0 vllm_ascend_branch: main + vllm_use_v1: 1 + max-parallel: 1 container: image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 volumes: @@ -67,6 +72,7 @@ jobs: HF_TOKEN: ${{ secrets.HF_TOKEN }} ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }} ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }} + VLLM_USE_V1: ${{ matrix.vllm_use_v1 }} steps: - name: Check npu and CANN info run: | @@ -136,7 +142,7 @@ jobs: - name: Install elastic_tool if: github.event_name != 'pull_request' run: | - pip install escli-tool==0.2.1 + pip install escli-tool==0.2.2 - name: Collect pr info from vllm-project/vllm-ascend if: github.event_name != 'pull_request' @@ -160,10 +166,10 @@ jobs: while IFS= read -r line || [[ -n "$line" ]]; do commit_id=${line%% *} commit_title=${line#* } - commit_time=$(git show -s --format=%cd $commit_hash --date=iso-strict) - commit_time_no_tz=${commit_time::19} git checkout $commit_id + commit_time=$(git show -s --format=%cd $commit_hash --date=iso-strict) + commit_time_no_tz=${commit_time::19} pip install -e . echo "------------------------" @@ -173,17 +179,17 @@ jobs: echo "vllm branch: ${{ matrix.vllm_branch }}" echo "vllm-ascend branch: ${{ matrix.vllm_ascend_branch }}" echo "------------------------" + cd /github/home bash benchmarks/scripts/run-performance-benchmarks.sh # send the result to es - if [[ "${{ github.event_name }}" != "pull request" ]]; then - escli add --vllm_branch ${{ matrix.vllm_branch }} \ - --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \ - --commit_id $commit_id \ - --commit_title "$commit_title" \ - --created_at "$commit_time_no_tz" \ - --res_dir ./benchmarks/results - rm -rf ./benchmarks/results - fi + escli add --vllm_branch ${{ matrix.vllm_branch }} \ + --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \ + --commit_id $commit_id \ + --commit_title "$commit_title" \ + --created_at "$commit_time_no_tz" \ + --res_dir ./benchmarks/results \ + --extra_feat '{"VLLM_USE_V1": "${{ matrix.vllm_use_v1 }}"}' + rm -rf ./benchmarks/results cd - done < commit_log.txt diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml deleted file mode 100644 index a19e6a89d..000000000 --- a/.github/workflows/ruff.yml +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: ruff - -on: - pull_request: - branches: - - 'main' - - '*-dev' - -jobs: - ruff: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 49a85f5be..9245820ff 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. # -name: 'e2e test' +name: 'test' on: pull_request: @@ -29,6 +29,12 @@ on: - '!docs/**' - 'pytest.ini' - '!benchmarks/**' + - 'tools/mypy.sh' + - 'mypy.ini' + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. # It's used to activate ascend-toolkit environment variables. @@ -37,12 +43,77 @@ defaults: shell: bash -el {0} jobs: - test: + lint: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Run codespell check + run: | + CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**') + CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn') + + codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}" + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only + - name: Running yapf + run: | + python -m pip install --upgrade pip + pip install toml + pip install yapf==0.32.0 + yapf --diff --recursive . 
+ + - name: Install dependencies + run: | + pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: vllm-empty + + - name: Actionlint Check + env: + SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086 + run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" + tools/actionlint.sh -color + + - name: Install vllm-project/vllm from source + working-directory: vllm-empty + run: | + pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu + VLLM_TARGET_DEVICE=empty pip install . + + - name: Mypy Check + run: | + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tools/mypy.sh 1 ${{ matrix.python-version }} + + e2e: + needs: [lint] + if: ${{ needs.lint.result == 'success' }} strategy: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [v0.9.0, v0.9.1] + vllm_version: [v0.9.1] concurrency: group: > ${{ @@ -107,34 +178,37 @@ jobs: run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py - pytest -sv tests/singlecard/test_scheduler.py # guided decoding doesn't work, fix it later # pytest -sv tests/singlecard/test_guided_decoding.py.py # test_ascend_config.py should be ran separately because it will regenerate the global config many times. pytest -sv tests/singlecard/test_ascend_config.py pytest -sv tests/singlecard/test_camem.py + pytest -sv tests/singlecard/core/test_ascend_scheduler.py + pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_scheduler.py \ --ignore=tests/singlecard/test_guided_decoding.py \ --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_camem.py + --ignore=tests/singlecard/test_camem.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py else pytest -sv tests/multicard/test_ilama_lora_tp2.py # To avoid oom, we need to run the test in a single process. 
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py fi - name: Run vllm-project/vllm-ascend test on V0 engine + if: ${{ github.event_name == 'schedule' }} env: VLLM_USE_V1: 0 run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py - pytest -sv tests/singlecard/test_scheduler.py # guided decoding doesn't work, fix it later # pytest -sv tests/singlecard/test_guided_decoding.py.py pytest -sv tests/singlecard/test_camem.py @@ -143,11 +217,12 @@ jobs: pytest -sv tests/singlecard/test_prompt_embedding.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_scheduler.py \ --ignore=tests/singlecard/test_guided_decoding.py \ --ignore=tests/singlecard/test_camem.py \ --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_prompt_embedding.py + --ignore=tests/singlecard/test_prompt_embedding.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py else pytest -sv tests/multicard/test_ilama_lora_tp2.py # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. 
@@ -155,5 +230,6 @@ jobs: VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py fi diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index 8afb95048..7ac0a825b 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -40,7 +40,7 @@ jobs: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [v0.9.0, v0.9.1] + vllm_version: [v0.9.1] name: vLLM Ascend long term test runs-on: ${{ matrix.os }} container: @@ -94,7 +94,8 @@ jobs: if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then # spec decode test VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py - VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_spec_decode.py + # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed + # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_spec_decode.py VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process pytest -sv tests/long_term/spec_decode --ignore=tests/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py pytest -sv tests/long_term/test_accuracy.py diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml index 0de616d4f..e44d60704 100644 --- a/.github/workflows/vllm_ascend_test_pd.yaml +++ b/.github/workflows/vllm_ascend_test_pd.yaml @@ -38,7 +38,7 @@ jobs: if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }} strategy: matrix: - vllm_verison: [v0.9.0, v0.9.1] + vllm_verison: [v0.9.1] name: vLLM Ascend prefilling decoding disaggregation test runs-on: linux-arm64-npu-static-8 diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml deleted file mode 100644 index 64497d156..000000000 --- a/.github/workflows/yapf.yml +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Adapted from vllm-project/vllm/blob/main/.github -# - -name: yapf - -on: - pull_request: - branches: - - 'main' - - '*-dev' - paths: - - "**/*.py" - - .github/workflows/yapf.yml - -jobs: - yapf: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install toml - pip install yapf==0.32.0 - - name: Running yapf - run: | - yapf --diff --recursive . diff --git a/CMakeLists.txt b/CMakeLists.txt index a2c3ad2e1..8d06c75f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,5 +96,3 @@ target_link_libraries( target_link_options(vllm_ascend_C PRIVATE "-Wl,-rpath,$ORIGIN:$ORIGIN/lib") install(TARGETS vllm_ascend_C vllm_ascend_kernels DESTINATION ${VLLM_ASCEND_INSTALL_PATH}) - - diff --git a/Dockerfile b/Dockerfile index 1dfd10c86..952e77fe9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.0 +ARG VLLM_TAG=v0.9.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index ffd1174d2..2ff3d0b39 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.0 +ARG VLLM_TAG=v0.9.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
diff --git a/benchmarks/ops/ben_vocabparallelembedding.py b/benchmarks/ops/ben_vocabparallelembedding.py new file mode 100644 index 000000000..e91cfed7b --- /dev/null +++ b/benchmarks/ops/ben_vocabparallelembedding.py @@ -0,0 +1,144 @@ +from typing import Tuple + +import numpy as np +import pytest +import torch +import torch_npu # noqa: F401 +import vllm # noqa: F401 + +import vllm_ascend.platform # noqa: F401 + + +def benchmark_npu(fn, num_iterations=100, num_warmup_iterations=50): + """ + Benchmark function for NPU operations + + Args: + fn: Function to benchmark + num_iterations: Number of timing iterations + num_warmup_iterations: Number of warmup iterations + + Returns: + float: Minimum elapsed time in seconds + """ + start = torch.npu.Event(enable_timing=True) + end = torch.npu.Event(enable_timing=True) + times = np.zeros(num_iterations + num_warmup_iterations) + + # Run iterations + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + fn() # Execute the function + end.record() + torch.npu.synchronize() + times[i] = start.elapsed_time(end) + + # Remove warmup iterations and convert to seconds + times = times[num_warmup_iterations:] + elapsed_time = np.amin(times) / 1000 + return elapsed_time + + +def get_masked_input_and_mask_ref( + input_: torch.Tensor, org_vocab_start_index: int, + org_vocab_end_index: int, num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Reference implementation for verification""" + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < + org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index) + added_offset = added_vocab_start_index - ( + org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding + valid_offset = (org_vocab_start_index * + org_vocab_mask) + (added_offset * added_vocab_mask) + vocab_mask = org_vocab_mask | added_vocab_mask + masked_input = vocab_mask * (input_ - valid_offset) + return masked_input, ~vocab_mask + + +DTYPES = [torch.int32] +SHAPES = [(3, 4, 5)] +DEVICES = [f"npu:{0}"] +SEEDS = [0] + + +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEEDS) +@torch.inference_mode() +def test_get_masked_input_and_mask( + shape: Tuple[int, ...], + dtype: torch.dtype, + device: str, + seed: int, +) -> None: + # Set random seed and device + torch.manual_seed(seed) + torch.set_default_device(device) + + # Generate random input tensor + input_tensor = torch.randint(0, 1000, shape, dtype=dtype) + + # Test parameters + test_case = { + "org_start": 100, + "org_end": 200, + "padding": 0, + "added_start": 300, + "added_end": 400, + } + + # Define reference function + def ref_fn(): + return get_masked_input_and_mask_ref(input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"]) + + # Define custom function + def custom_fn(): + return torch.ops._C.get_masked_input_and_mask(input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"]) + + # Get results for correctness testing + ref_masked_input, ref_mask = ref_fn() + custom_masked_input, custom_mask = custom_fn() + + # Benchmark both implementations + ref_time = benchmark_npu(ref_fn) + custom_time = benchmark_npu(custom_fn) + + # Print 
performance results + print("\nPerformance Results:") + print(f"Reference implementation: {ref_time*1000:.3f} ms") + print(f"Custom implementation: {custom_time*1000:.3f} ms") + print(f"Speedup: {ref_time/custom_time:.2f}x") + + # Compare results for correctness + ref_masked_input = ref_masked_input.to(dtype) + print("\nResults comparison:") + print("custom_masked_input:", custom_masked_input) + print("ref_masked_input:", ref_masked_input) + print("custom_mask:", custom_mask) + print("ref_mask:", ref_mask) + torch.testing.assert_close( + custom_masked_input, + ref_masked_input, + rtol=1e-5, + atol=1e-5, + msg=f"Masked input mismatch for case: {test_case}") + torch.testing.assert_close(custom_mask, + ref_mask, + rtol=1e-5, + atol=1e-5, + msg=f"Mask mismatch for case: {test_case}") diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py index f508feda7..79c58bcbc 100644 --- a/benchmarks/scripts/run_accuracy.py +++ b/benchmarks/scripts/run_accuracy.py @@ -26,7 +26,7 @@ import lm_eval import torch -UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B"] +UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B-Base"] UNIMODAL_TASK = ["ceval-valid", "gsm8k"] MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"] MULTIMODAL_TASK = ["mmmu_val"] @@ -36,17 +36,17 @@ MODEL_RUN_INFO = { "Qwen/Qwen2.5-7B-Instruct": ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" - "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen3-8B-Base": ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" - "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen2.5-VL-7B-Instruct": ("export MODEL_ARGS='pretrained={model}, max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2'\n" - "lm_eval --model vllm-vlm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --batch_size 1"), } diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json index 576ced213..40cec4c5f 100644 --- a/benchmarks/tests/latency-tests.json +++ b/benchmarks/tests/latency-tests.json @@ -9,5 +9,15 @@ "num_iters_warmup": 5, "num_iters": 15 } + }, + { + "test_name": "latency_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } } ] diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json index d8ad2be2b..c8d5cdaf9 100644 --- a/benchmarks/tests/serving-tests.json +++ b/benchmarks/tests/serving-tests.json @@ -49,5 +49,29 @@ "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } + }, + { + "test_name": "serving_qwen2_5_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + 
"load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } } ] diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json index 551d23882..3698e69f3 100644 --- a/benchmarks/tests/throughput-tests.json +++ b/benchmarks/tests/throughput-tests.json @@ -22,6 +22,17 @@ "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", "num_prompts": 200 } + }, + { + "test_name": "throughput_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } } ] diff --git a/csrc/kernels/get_masked_input_and_mask_kernel.cpp b/csrc/kernels/get_masked_input_and_mask_kernel.cpp new file mode 100644 index 000000000..47ce826f2 --- /dev/null +++ b/csrc/kernels/get_masked_input_and_mask_kernel.cpp @@ -0,0 +1,345 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + */ + +#include "kernel_operator.h" +#include "kernel_tensor_impl.h" +#include "kernel_type.h" +#include "types.h" +#include "utils.h" +using vllm_ascend::AccType; + +template +class GetMaskedInputAndMask { +public: + __aicore__ inline GetMaskedInputAndMask() {} + + __aicore__ inline ~GetMaskedInputAndMask() { + pipe.Reset(); + } + + + __aicore__ inline void Init( + __gm__ scalar_t* input, + __gm__ scalar_t* masked_input, + __gm__ bool* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size) + { + // Initialize basic parameters + input_ = input; + masked_input_ = masked_input; + mask_out_ = mask_out; + org_vocab_start_index_ = org_vocab_start_index; + org_vocab_end_index_ = org_vocab_end_index; + size_ = ((size + 31) / 32) * 32; + added_offset_ = added_vocab_start_index - + (org_vocab_end_index - org_vocab_start_index) - + num_org_vocab_padding; + added_vocab_start_index_ = added_vocab_start_index; + added_vocab_end_index_ = added_vocab_end_index; + + // Initialize global tensors + inputGlobal.SetGlobalBuffer(input); + maskedOutputGlobal.SetGlobalBuffer(masked_input); + maskOutGlobal.SetGlobalBuffer(mask_out); + + // Initialize queues + pipe.InitBuffer(inQueue, 1, size_ * sizeof(scalar_t)); + pipe.InitBuffer(outQueue, 1, size_ * sizeof(scalar_t)); + pipe.InitBuffer(maskQueue, 1, size_ * sizeof(bool)); + + // Initialize calculation buffers + pipe.InitBuffer(calc_buf_1, size_ * sizeof(float)); + pipe.InitBuffer(calc_buf_2, size_ * sizeof(float)); + + // Initialize result queues + pipe.InitBuffer(result_ge_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_le_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_org_mask_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_add_mask_que, BUFFER_NUM, size_ * sizeof(float)); + + // Initialize temporary buffers + pipe.InitBuffer(start_buf, size_ * sizeof(float)); + pipe.InitBuffer(end_buf, size_ * sizeof(float)); + pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); + pipe.InitBuffer(validOffset_buf, size_ * sizeof(float)); + pipe.InitBuffer(vocabMask_buf_, size_ * sizeof(int8_t)); + pipe.InitBuffer(ones_buf_, size_ * sizeof(float)); + } + + 
__aicore__ inline void Process() + { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() + { + AscendC::LocalTensor inputLocal = inQueue.AllocTensor(); + AscendC::DataCopy(inputLocal, inputGlobal, size_); + inQueue.EnQue(inputLocal); + } + + __aicore__ inline void CompareWithValue( + AscendC::LocalTensor& result, + const AscendC::LocalTensor& input, + const AscendC::LocalTensor& compare_value, + bool is_greater_equal) { + + AscendC::LocalTensor compute_buf = calc_buf_1.Get(); + if (is_greater_equal) { + AscendC::Max(compute_buf, input, compare_value, size_); + AscendC::Sub(compute_buf, compare_value, compute_buf, size_); + } else { + AscendC::Max(compute_buf, input, compare_value, size_); + AscendC::Sub(compute_buf, compute_buf, compare_value, size_); + } + + AscendC::Abs(compute_buf, compute_buf, size_); + AscendC::Mins(compute_buf, compute_buf, MIN_ACCURACY_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_2_FP32, size_); + AscendC::Adds(compute_buf, compute_buf, NEGATIVE_ONE_FP32, size_); + AscendC::Abs(compute_buf, compute_buf, size_); + + AscendC::LocalTensor compute_buf_fp16 = calc_buf_2.Get(); + AscendC::Cast(compute_buf_fp16, compute_buf, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(result, compute_buf_fp16, AscendC::RoundMode::CAST_NONE, size_); + } + + __aicore__ inline void ComputeRangeMask( + AscendC::LocalTensor& range_mask, + const AscendC::LocalTensor& input, + const float start_value, + const float end_value) { + + // Use already initialized buffers + AscendC::LocalTensor start_value_tensor = start_buf.Get(); + AscendC::LocalTensor end_value_tensor = end_buf.Get(); + + AscendC::Duplicate(start_value_tensor, start_value, size_); + AscendC::Duplicate(end_value_tensor, end_value, size_); + + AscendC::LocalTensor ge_result = result_ge_que.AllocTensor(); + AscendC::LocalTensor lt_result = result_le_que.AllocTensor(); + + CompareWithValue(ge_result, start_value_tensor, input, true); + CompareWithValue(lt_result, input, end_value_tensor, false); + + AscendC::And(range_mask, ge_result, lt_result, size_); + } + + __aicore__ inline void Compute() { + AscendC::LocalTensor inputLocal = inQueue.DeQue(); + AscendC::LocalTensor maskedLocal = outQueue.AllocTensor(); + AscendC::LocalTensor maskLocal = maskQueue.AllocTensor(); + + AscendC::LocalTensor inputFloat = inputFloat_buf.Get(); + AscendC::Cast(inputFloat, inputLocal, AscendC::RoundMode::CAST_NONE, size_); + + // Calculate mask for org_vocab range + // org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) + AscendC::LocalTensor orgVocabMask = result_org_mask_que.AllocTensor(); + ComputeRangeMask(orgVocabMask, + inputFloat, + static_cast(org_vocab_start_index_), + static_cast(org_vocab_end_index_)); + + // Calculate mask for added_vocab range + // added_vocab_mask = (input_ >= added_vocab_start_index) & (input_ < added_vocab_end_index) + AscendC::LocalTensor addedVocabMask = result_add_mask_que.AllocTensor(); + ComputeRangeMask(addedVocabMask, + inputFloat, + static_cast(added_vocab_start_index_), + static_cast(added_vocab_end_index_)); + + // Calculate validOffset + // valid_offset = (org_vocab_start_index * org_vocab_mask) + (added_offset * added_vocab_mask) + AscendC::LocalTensor validOffset = validOffset_buf.Get(); + AscendC::LocalTensor constOrgStartIndex = start_buf.Get(); + + 
AscendC::Duplicate(constOrgStartIndex, float(org_vocab_start_index_), size_); + + AscendC::LocalTensor orgVocabMask_fp16; + AscendC::LocalTensor orgVocabMask_fp32; + AscendC::Cast(orgVocabMask_fp16, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(orgVocabMask_fp32, orgVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Mul(validOffset, + constOrgStartIndex, + orgVocabMask_fp32, + size_); + + AscendC::LocalTensor addedOffset; + AscendC::LocalTensor addedOffsetTensor = end_buf.Get(); + AscendC::Duplicate(addedOffsetTensor, float(added_offset_), size_); + + AscendC::LocalTensor addedVocabMask_fp16; + AscendC::LocalTensor addedVocabMask_fp32; + AscendC::Cast(addedVocabMask_fp16, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(addedVocabMask_fp32, addedVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Mul(addedOffset, + addedOffsetTensor, + addedVocabMask_fp32, + size_); + + AscendC::Add(validOffset, validOffset, addedOffset, size_); + + // vocab_mask = org_vocab_mask | added_vocab_mask + AscendC::LocalTensor vocabMask = vocabMask_buf_.Get(); + + AscendC::Or(vocabMask, + orgVocabMask, + addedVocabMask, + size_); + + AscendC::Sub(inputFloat, inputFloat, validOffset, size_); + + // input_ = vocab_mask * (input_ - valid_offset) + AscendC::LocalTensor vocabMask_fp16; + AscendC::LocalTensor vocabMask_fp32; + AscendC::Cast(vocabMask_fp16, vocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(vocabMask_fp32, vocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::LocalTensor inputFloat_fp32; + AscendC::Mul(inputFloat, inputFloat, vocabMask_fp32, size_); + + AscendC::Cast(maskedLocal, inputFloat, AscendC::RoundMode::CAST_CEIL, size_); + outQueue.EnQue(maskedLocal); + + // ~vocab_mask + AscendC::LocalTensor ones_tensor = ones_buf_.Get(); + AscendC::Duplicate(ones_tensor, (float)1, size_); + AscendC::LocalTensor maskLocal_fp32; + + AscendC::Sub(maskLocal_fp32, + ones_tensor, + vocabMask_fp32, + size_); + + AscendC::LocalTensor maskLocal_fp16; + AscendC::Cast(maskLocal_fp16, maskLocal_fp32, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(maskLocal, maskLocal_fp16, AscendC::RoundMode::CAST_NONE, size_); + maskQueue.EnQue(maskLocal); + inQueue.FreeTensor(inputLocal); + } + + __aicore__ inline void CopyOut() + { + AscendC::LocalTensor maskedLocal = outQueue.DeQue(); + AscendC::LocalTensor maskLocal = maskQueue.DeQue(); + + AscendC::DataCopy(maskedOutputGlobal, maskedLocal, size_); + AscendC::DataCopy(maskOutGlobal, maskLocal, size_); + + outQueue.FreeTensor(maskedLocal); + maskQueue.FreeTensor(maskLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + AscendC::TPipe pipe; + AscendC::TQue inQueue; + AscendC::TQue outQueue, maskQueue; + AscendC::GlobalTensor inputGlobal, maskedOutputGlobal; + AscendC::GlobalTensor maskOutGlobal; + AscendC::TBuf calc_buf_1; + AscendC::TBuf calc_buf_2; + AscendC::TQue result_ge_que; + AscendC::TQue result_le_que; + AscendC::TQue result_org_mask_que; + AscendC::TQue result_add_mask_que; + + // Temporary buffers + AscendC::TBuf start_buf; + AscendC::TBuf end_buf; + + // Temporary buffers continued + AscendC::TBuf inputFloat_buf; + AscendC::TBuf validOffset_buf; + AscendC::TBuf vocabMask_buf_; + AscendC::TBuf ones_buf_; + + __gm__ scalar_t *input_, *masked_input_; + __gm__ bool *mask_out_; + int64_t size_; + int64_t org_vocab_start_index_, org_vocab_end_index_; + int64_t added_vocab_start_index_, added_vocab_end_index_; + int64_t added_offset_; + + static 
constexpr float MIN_ACCURACY_FP32 = 1.1754943508222875e-38; + static constexpr float MAX_MUL_1_FP32 = 1125899906842624; + static constexpr float MAX_MUL_2_FP32 = 67108864; + static constexpr float NEGATIVE_ONE_FP32 = -1.0f; +}; + +extern "C" __global__ __aicore__ void get_masked_input_and_mask_kernel( + __gm__ int32_t* input, + __gm__ int32_t* masked_input, + __gm__ bool* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num) +{ + { + GetMaskedInputAndMask op{}; + + for (int64_t i = AscendC::GetBlockIdx(); i < loop_cnt; i += aiv_num) { + op.Init(input + i * size/loop_cnt, + masked_input + i * size/loop_cnt, + mask_out + i * size/loop_cnt, + org_vocab_start_index, org_vocab_end_index, + num_org_vocab_padding, added_vocab_start_index, + added_vocab_end_index, size/loop_cnt); + + op.Process(); + } + } // op destructor called here +} + +namespace vllm_ascend { + +void get_masked_input_and_mask_impl( + void* stream, + void* input, + void* masked_input, + void* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num) +{ + get_masked_input_and_mask_kernel<<>>( + static_cast(input), + static_cast(masked_input), + static_cast(mask_out), + org_vocab_start_index, + org_vocab_end_index, + num_org_vocab_padding, + added_vocab_start_index, + added_vocab_end_index, + size, + loop_cnt, + aiv_num); +} + +} // namespace vllm_ascend + diff --git a/csrc/ops.h b/csrc/ops.h index b921b2bf9..b1bc6028e 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -31,6 +31,20 @@ namespace vllm_ascend { const int headSize, const int64_t numTokens, const uint32_t loopCnt, uint32_t aivNum); + extern void get_masked_input_and_mask_impl( + void* stream, + void* input, + void* masked_input, + void* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num); + torch::Tensor weak_ref_tensor(torch::Tensor& tensor) { if (!tensor.is_privateuseone()) { throw std::runtime_error("Tensor must be on NPU device"); diff --git a/csrc/torch_binding.cpp b/csrc/torch_binding.cpp index c4154381d..001a9314d 100644 --- a/csrc/torch_binding.cpp +++ b/csrc/torch_binding.cpp @@ -99,6 +99,112 @@ std::tuple rotary_embedding(at::Tensor &positions, at::T return {query_dst, key_dst}; } +std::tuple get_masked_input_and_mask( + at::Tensor &input, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index) + /* + https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/vocab_parallel_embedding.py#L161-L198 + Embedding parallelized in the vocabulary dimension. + + Adapted from torch.nn.Embedding, note that we pad the vocabulary size to + make sure it is divisible by the number of model parallel GPUs. + + In order to support various loading methods, we ensure that LoRA-added + embeddings are always at the end of TP-sharded tensors. 
In other words, + we shard base embeddings and LoRA embeddings separately (both padded), + and place them in the same tensor. + In this example, we will have the original vocab size = 1010, + added vocab size = 16 and padding to 64. Therefore, the total + vocab size with padding will be 1088 (because we first pad 1010 to + 1024, add 16, and then pad to 1088). + Therefore, the tensor format looks like the following: + TP1, rank 0 (no sharding): + |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >| + corresponding token_id: | 0 | 1 | ... | 1009 | -1 | ... | -1 | 1010 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 | + + TP2, rank 0: + |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >| + corresponding token_id: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 1000 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 520 | ... | 543 | + TP2, rank 1: + |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >| + corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 | + Parameters: + org_vocab_start_index //base embeddings start + org_vocab_end_index //base embeddings end + num_org_vocab_padding //base embeddings padding + added_vocab_start_index //LoRA embeddings start + added_vocab_end_index //LoRA embeddings end + */ +{ + // Input validation + TORCH_CHECK(input.dim() >= 1, "input must have at least 1 dimension"); + TORCH_CHECK(org_vocab_start_index >= 0, "org_vocab_start_index must be non-negative"); + TORCH_CHECK(org_vocab_end_index >= org_vocab_start_index, "org_vocab_end_index must be greater than org_vocab_start_index"); + TORCH_CHECK(num_org_vocab_padding >= 0, "num_org_vocab_padding must be non-negative"); + TORCH_CHECK(added_vocab_start_index >= org_vocab_end_index, "added_vocab_start_index must be greater than org_vocab_end_index"); + TORCH_CHECK(added_vocab_end_index >= added_vocab_start_index, "added_vocab_end_index must be greater than added_vocab_start_index"); + + // Get total number of elements + int64_t size = input.numel(); + + // Create output tensors + at::Tensor masked_input = at::empty_like(input); + at::Tensor mask = at::empty_like(input).to(at::kBool); + + // Get data pointers + void *input_ptr = input.data_ptr(); + void *masked_input_ptr = masked_input.data_ptr(); + void *mask_ptr = mask.data_ptr(); + + // Get current stream + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + + // Get scalar type + at::ScalarType scalar_type = input.scalar_type(); + + // Create and configure OpCommand + at_npu::native::OpCommand cmd; + cmd.Name("get_masked_input_and_mask"); + cmd.SetCustomHandler([scalar_type, size, stream, + input_ptr, masked_input_ptr, mask_ptr, + org_vocab_start_index, org_vocab_end_index, + num_org_vocab_padding, added_vocab_start_index, + added_vocab_end_index]() -> int { + // Get platform info + fe::PlatFormInfos platform_infos; + int device_id = 0; + fe::PlatformInfoManager::GeInstance().GetRuntimePlatformInfosByDevice(device_id, platform_infos); + uint32_t aivNum = platform_infos.GetCoreNumByType("aiv"); + uint32_t loop_cnt = (size + aivNum - 1) / aivNum; + + // Call implementation + get_masked_input_and_mask_impl( + stream, + input_ptr, + masked_input_ptr, + mask_ptr, + org_vocab_start_index, 
+ org_vocab_end_index, + num_org_vocab_padding, + added_vocab_start_index, + added_vocab_end_index, + size, + loop_cnt, + aivNum); + + return 0; + }); + cmd.Run(); + return {masked_input, mask}; +} + void verify_tensor(std::string const& name, at::Tensor const& t, int64_t const size_0, int64_t const size_1, c10::ScalarType const type) { @@ -194,6 +300,16 @@ TORCH_LIBRARY_EXPAND(_C, ops) " Tensor! key, int head_size," " Tensor cos_sin_cache, bool is_neox) -> (Tensor query, Tensor key)"); ops.impl("rotary_embedding", torch::kPrivateUse1, &vllm_ascend::rotary_embedding); + + ops.def( + "get_masked_input_and_mask(Tensor input, " + " int org_vocab_start_index, " + " int org_vocab_end_index, " + " int num_org_vocab_padding, " + " int added_vocab_start_index, " + " int added_vocab_end_index) -> (Tensor masked_input, Tensor mask)"); + ops.impl("get_masked_input_and_mask", torch::kPrivateUse1, &vllm_ascend::get_masked_input_and_mask); + ops.def( "advance_step_flashattn_ascendc(int num_seqs, int num_queries, int block_size," " Tensor! input_tokens, Tensor! sampled_token_ids, Tensor! input_positions," diff --git a/docs/source/community/contributors.md b/docs/source/community/contributors.md index efbea6657..d61259de8 100644 --- a/docs/source/community/contributors.md +++ b/docs/source/community/contributors.md @@ -1,4 +1,6 @@ -# Maintainers +# Maintainers and contributors + +## Maintainers | Name | Github ID | Date | |:-----------:|:-----:|:-----:| @@ -6,13 +8,30 @@ | Yikun Jiang| [@Yikun](https://github.com/Yikun) | 2025/02 | | Yi Gan| [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/02 | -# Contributors +## Contributors vLLM Ascend every release would not have been possible without the following contributors: +Updated on 2025-06-10: + | Number | Contributor | Date | Commit ID | |:------:|:-----------:|:-----:|:---------:| -| 51 | [@YisongJiang](https://github.com/YisongJiang) | 2025/5/29 | [90afaf6](https://github.com/vllm-project/vllm-ascend/commit/90afaf6306f680307462becf3c78585737579851) | +| 66 | [@Yuxiao-Xu](https://github.com/Yuxiao-Xu) | 2025/6/9 | [6b853f1](https://github.com/vllm-project/vllm-ascend/commit/6b853f15fe69ba335d2745ebcf14a164d0bcc505) | +| 65 | [@ChenTaoyu-SJTU](https://github.com/ChenTaoyu-SJTU) | 2025/6/7 | [20dedba](https://github.com/vllm-project/vllm-ascend/commit/20dedba5d1fc84b7ae8b49f9ce3e3649389e2193) | +| 64 | [@zxdukki](https://github.com/zxdukki) | 2025/6/7 | [87ebaef](https://github.com/vllm-project/vllm-ascend/commit/87ebaef4e4e519988f27a6aa378f614642202ecf) | +| 63 | [@sdmyzlp](https://github.com/sdmyzlp) | 2025/6/7 | [3640c60](https://github.com/vllm-project/vllm-ascend/commit/3640c60b0eb4d4cb104e20bfa406d3f1d17920a7) | +| 62 | [@weijinqian0](https://github.com/weijinqian0) | 2025/6/7 | [e9ada68](https://github.com/vllm-project/vllm-ascend/commit/e9ada685ece798f9fe0d4a287e3f5246a8a7207b) | +| 61 | [@hahazhky](https://github.com/hahazhky) | 2025/6/6 | [0b12c2a](https://github.com/vllm-project/vllm-ascend/commit/0b12c2acf7d9fd192beebebf662298067d9a5435) | +| 60 | [@depeng1994](https://github.com/depeng1994) | 2025/6/6 | [6b094a2](https://github.com/vllm-project/vllm-ascend/commit/6b094a2bd49a8a41eb3647568b2d9e5b337db81f) | +| 59 | [@David9857](https://github.com/David9857) | 2025/6/5 | [78431b3](https://github.com/vllm-project/vllm-ascend/commit/78431b34694dfa3c8f54ed7cc626660318557927) | +| 58 | [@momo609](https://github.com/momo609) | 2025/6/5 | [908a851](https://github.com/vllm-project/vllm-ascend/commit/908a851a776cfd9051cc062119e6ec481561c6f7) | +| 57 
| [@zhangxinyuehfad](https://github.com/zhangxinyuehfad) | 2025/6/5 | [7737aaa](https://github.com/vllm-project/vllm-ascend/commit/7737aaa40f699b233a35fb61e908b687adc1e2e5) | +| 56 | [@NINGBENZHE](https://github.com/NINGBENZHE) | 2025/6/3 | [6ec64a3](https://github.com/vllm-project/vllm-ascend/commit/6ec64a3f9686df65b5a23a41aa301e669db19099) | +| 55 | [@XWFAlone](https://github.com/XWFAlone) | 2025/5/30 | [3442fbd](https://github.com/vllm-project/vllm-ascend/commit/3442fbdb235b4c6d72c2bc64a49707a7bd89958e) | +| 54 | [@YisongJiang](https://github.com/YisongJiang) | 2025/5/29 | [90afaf6](https://github.com/vllm-project/vllm-ascend/commit/90afaf6306f680307462becf3c78585737579851) | +| 53 | [@ponix-j](https://github.com/ponix-j) | 2025/5/23 | [df58fb8](https://github.com/vllm-project/vllm-ascend/commit/df58fb80eee24139fc61c495be3ce79cf81b3f73) | +| 52 | [@ttanzhiqiang](https://github.com/ttanzhiqiang) | 2025/5/23 | [dc6172e](https://github.com/vllm-project/vllm-ascend/commit/dc6172efd3860ce95b40a7b3e93611f875f06d40) | +| 51 | [@yangpuPKU](https://github.com/yangpuPKU) | 2025/5/23 | [46df67a](https://github.com/vllm-project/vllm-ascend/commit/46df67a5e9ab73fade08cbb2d8c0155cee7316d1) | | 50 | [@wonderful199082](https://github.com/wonderful199082) | 2025/5/20 | [5cf9ff1](https://github.com/vllm-project/vllm-ascend/commit/5cf9ff18e91b0b7031c258d71a257b8e24689763) | | 49 | [@22dimensions](https://github.com/22dimensions) | 2025/5/17 | [a8730e7](https://github.com/vllm-project/vllm-ascend/commit/a8730e7a3c4ac6c4b39a5946c943252fdea6cce5) | | 48 | [@cxcxflying](https://github.com/cxcxflying) | 2025/5/13 | [e564470](https://github.com/vllm-project/vllm-ascend/commit/e56447033889ca95df512208cab22ef832bfdf07) | diff --git a/docs/source/conf.py b/docs/source/conf.py index ba8d83949..63fb5dbe4 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -69,10 +69,10 @@ # the branch of vllm-ascend, used in vllm-ascend clone and image tag # - main branch: 'main' # - vX.Y.Z branch: latest vllm-ascend release tag - 'vllm_ascend_version': 'v0.9.0rc1', + 'vllm_ascend_version': 'v0.9.0rc2', # the newest release version of vllm-ascend and matched vLLM, used in pip install. # This value should be updated when cut down release. 
- 'pip_vllm_ascend_version': "0.9.0rc1", + 'pip_vllm_ascend_version': "0.9.0rc2", 'pip_vllm_version': "0.9.0", # CANN image tag 'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10", diff --git a/docs/source/developer_guide/evaluation/accuracy_report/index.md b/docs/source/developer_guide/evaluation/accuracy_report/index.md new file mode 100644 index 000000000..51fae4886 --- /dev/null +++ b/docs/source/developer_guide/evaluation/accuracy_report/index.md @@ -0,0 +1,6 @@ +# Accuracy Report + +:::{toctree} +:caption: Accuracy Report +:maxdepth: 1 +::: \ No newline at end of file diff --git a/docs/source/developer_guide/evaluation/index.md b/docs/source/developer_guide/evaluation/index.md index 324c2e2f2..ea861abb1 100644 --- a/docs/source/developer_guide/evaluation/index.md +++ b/docs/source/developer_guide/evaluation/index.md @@ -6,6 +6,7 @@ using_lm_eval using_opencompass using_evalscope +accuracy_report/index ::: :::{toctree} diff --git a/docs/source/developer_guide/evaluation/profile_execute_duration.md b/docs/source/developer_guide/evaluation/profile_execute_duration.md index 8989bf9cf..3b37df6be 100644 --- a/docs/source/developer_guide/evaluation/profile_execute_duration.md +++ b/docs/source/developer_guide/evaluation/profile_execute_duration.md @@ -9,6 +9,11 @@ The execution duration of each stage (including pre/post-processing, model forwa * Use the non-blocking API `ProfileExecuteDuration().capture_async` to set observation points asynchronously when you need to observe the execution duration. * Use the blocking API `ProfileExecuteDuration().pop_captured_sync` at an appropriate time to get and print the execution durations of all observed stages. +**We have instrumented the key inference stages (including pre-processing, model forward pass, etc.) for execute duration profiling. 
Execute the script as follows:**
+```
+VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE=1 python3 vllm-ascend/examples/offline_inference_npu.py
+```
+
## Example Output
```
diff --git a/docs/source/developer_guide/versioning_policy.md b/docs/source/developer_guide/versioning_policy.md
index bbb08c3be..1533b0af5 100644
--- a/docs/source/developer_guide/versioning_policy.md
+++ b/docs/source/developer_guide/versioning_policy.md
@@ -22,6 +22,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo |
|-------------|--------------|------------------|-------------|--------------------|--------------|
+| v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
| v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
| v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
| v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | |
@@ -34,6 +35,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:
| Date | Event |
|------------|-------------------------------------------|
+| 2025.06.10 | Release candidates, v0.9.0rc2 |
| 2025.06.09 | Release candidates, v0.9.0rc1 |
| 2025.05.29 | v0.7.x post release, v0.7.3.post1 |
| 2025.05.08 | v0.7.x Final release, v0.7.3 |
@@ -71,6 +73,7 @@ Usually, each minor version of vLLM (such as 0.7) will correspond to a vLLM Asce
| Branch | Status | Note |
|------------|--------------|--------------------------------------|
| main | Maintained | CI commitment for vLLM main branch and vLLM 0.9.x branch |
+| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.0 and 0.9.1 versions |
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version |
| v0.7.1-dev | Unmaintained | Replaced by v0.7.3-dev |
diff --git a/docs/source/faqs.md b/docs/source/faqs.md
index 8c840be45..1de3befb2 100644
--- a/docs/source/faqs.md
+++ b/docs/source/faqs.md
@@ -3,7 +3,7 @@
## Version Specific FAQs
- [[v0.7.3.post1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1007)
-- [[v0.9.0rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115)
+- [[v0.9.0rc2] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115)
## General FAQs
@@ -69,14 +69,14 @@ If all above steps are not working, feel free to submit a GitHub issue.
### 7. How does vllm-ascend perform?
-Currently, only some models are improved. Such as `Qwen2 VL`, `Deepseek V3`. Others are not good enough. In the future, we will support graph mode and custom ops to improve the performance of vllm-ascend. And when the official release of vllm-ascend is released, you can install `mindie-turbo` with `vllm-ascend` to speed up the inference as well.
+Currently, only some models are well optimized, such as `Qwen2 VL` and `Deepseek V3`; others are not good enough. From 0.9.0rc2, Qwen and DeepSeek work with graph mode and deliver good performance. What's more, you can install `mindie-turbo` with `vllm-ascend v0.7.3` to speed up the inference as well.
### 8. How vllm-ascend work with vllm?
vllm-ascend is a plugin for vllm. Basically, the version of vllm-ascend is the same as the version of vllm. For example, if you use vllm 0.7.3, you should use vllm-ascend 0.7.3 as well. For main branch, we will make sure `vllm-ascend` and `vllm` are compatible by each commit.
### 9. Does vllm-ascend support Prefill Disaggregation feature?
-Currently, only 1P1D is supported by vllm.
For vllm-ascend, it'll be done by [this PR](https://github.com/vllm-project/vllm-ascend/pull/432). For NPND, vllm is not stable and fully supported yet. We will make it stable and supported by vllm-ascend in the future.
+Currently, only 1P1D is supported on the V0 Engine. V1 Engine and NPND support will be stabilized and added to vllm-ascend in the future.
### 10. Does vllm-ascend support quantization method?
@@ -84,9 +84,7 @@ Currently, w8a8 quantization is already supported by vllm-ascend originally on v
### 11. How to run w8a8 DeepSeek model?
-Currently, w8a8 DeepSeek is working in process: [support AscendW8A8 quantization](https://github.com/vllm-project/vllm-ascend/pull/511)
-
-Please run DeepSeek with BF16 now, following the [Multi-Node DeepSeek inferencing tutorail](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_node.html)
+Please follow the [quantization inferencing tutorial](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_npu_quantization.html) and replace the model with DeepSeek.
### 12. There is not output in log when loading models using vllm-ascend, How to solve it?
@@ -115,3 +113,13 @@ In scenarios where NPUs have limited HBM (High Bandwidth Memory) capacity, dynam
- **Adjust `--gpu-memory-utilization`**: If unspecified, will use the default value of `0.9`. You can decrease this param to reserve more memory to reduce fragmentation risks. See more note in: [vLLM - Inference and Serving - Engine Arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine.arg_utils-_engine_args_parser-cacheconfig).
- **Configure `PYTORCH_NPU_ALLOC_CONF`**: Set this environment variable to optimize NPU memory management. For example, you can `export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` to enable virtual memory feature to mitigate memory fragmentation caused by frequent dynamic memory size adjustments during runtime, see more note in: [PYTORCH_NPU_ALLOC_CONF](https://www.hiascend.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html).
+
+### 15. Failed to enable NPU graph mode when running DeepSeek?
+You may encounter the following error when running DeepSeek with NPU graph mode enabled. When both MLA and graph mode are enabled, the allowed number of queries per KV head is restricted to {32, 64, 128}. **Thus this is not supported for DeepSeek-V2-Lite**, as it only has 16 attention heads. NPU graph mode support for DeepSeek-V2-Lite will be added in the future.
+
+And if you're using DeepSeek-V3 or DeepSeek-R1, please make sure that, after the tensor parallel split, num_heads / num_kv_heads is in {32, 64, 128}.
+
+```bash
+[rank0]: RuntimeError: EZ9999: Inner Error!
+[rank0]: EZ9999: [PID: 62938] 2025-05-27-06:52:12.455.807 numHeads / numKvHeads = 8, MLA only support {32, 64, 128}.[FUNC:CheckMlaAttrs][FILE:incre_flash_attention_tiling_check.cc][LINE:1218]
+```
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 8fbe1825d..c9f684491 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -78,17 +78,17 @@ source vllm-ascend-env/bin/activate
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple attrs 'numpy<2.0.0' decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
# Download and install the CANN package.
-wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run chmod +x ./Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run ./Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run --full source /usr/local/Ascend/ascend-toolkit/set_env.sh -wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run chmod +x ./Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run ./Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run --install -wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run chmod +x ./Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run ./Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run --install diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index 51b8e5431..d4756ef5e 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -40,11 +40,13 @@ The details of each config option are as follows: | Name | Type | Default | Description | | ---- | ---- | ------- | ----------- | | `enabled` | bool | `False` | Whether to enable torchair graph mode | +| `enable_multistream_mla`| bool | `False` | Whether to put vector ops of MLA to another stream | +| `enable_multistream_moe`| bool | `False` | Whether to enable multistream shared expert | | `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization | | `use_cached_graph` | bool | `False` | Whether to use cached graph | | `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache | | `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty | -| `enable_multistream_shared_expert`| bool | `False` | Whether to enable multistream shared expert | +| `enable_kv_nz`| bool | `False` | Whether to enable kvcache NZ layout | **ascend_scheduler_config** @@ -52,7 +54,7 @@ The details of each config option are as follows: | ---- | ---- | ------- | ----------- | | `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine| -ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `chunked_prefill_enabled: true` to ascend_scheduler_config as well. +ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well. 
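As a quick illustration of the paragraph above, the sketch below shows how such scheduler options are typically passed through the `additional_config` argument of `LLM` in Python. This is a minimal sketch only, not part of this change; the model name is just a placeholder, and it assumes the Python API accepts `additional_config` directly, as in the graph mode guide.

```python
# Minimal sketch (not part of this change): pass ascend scheduler options via
# additional_config. The model name below is only a placeholder.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
            # Options from the vLLM SchedulerConfig can be added here as well.
            "enable_chunked_prefill": True,
        },
    },
)
print(llm.generate("Hello, how are you?"))
```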
### Example @@ -61,17 +63,18 @@ A full example of additional configuration is as follows: ``` { "torchair_graph_config": { - "enabled": true, - "use_cached_graph": true, + "enabled": True, + "use_cached_graph": True, "graph_batch_sizes": [1, 2, 4, 8], - "graph_batch_sizes_init": false, - "enable_multistream_shared_expert": false + "graph_batch_sizes_init": False, + "enable_multistream_moe": False, + "enable_kv_nz": False }, "ascend_scheduler_config": { - "enabled": true, - "chunked_prefill_enabled": true, + "enabled": True, + "enable_chunked_prefill": True, }, "expert_tensor_parallel_size": 1, - "refresh": false, + "refresh": False, } ``` diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md index 2bd83ffe7..38d356249 100644 --- a/docs/source/user_guide/graph_mode.md +++ b/docs/source/user_guide/graph_mode.md @@ -23,7 +23,7 @@ import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" model = LLM(model="Qwen/Qwen2-7B-Instruct") outputs = model.generate("Hello, how are you?") @@ -45,16 +45,17 @@ offline example: import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" -model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enable": True}}) +# TorchAirGraph is only work without chunked-prefill now +model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}}) outputs = model.generate("Hello, how are you?") ``` online example: ```shell -vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enable": True}}' +vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}}' ``` You can find more detail about additional config [here](./additional_config.md) @@ -69,7 +70,7 @@ offline example: import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" model = LLM(model="someother_model_weight", enforce_eager=True) outputs = model.generate("Hello, how are you?") diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md index 42a944f44..8f72b937e 100644 --- a/docs/source/user_guide/release_notes.md +++ b/docs/source/user_guide/release_notes.md @@ -1,5 +1,13 @@ # Release note +## v0.9.0rc2 - 2025.06.10 + +This release contains some quick fixes for v0.9.0rc1. Please use this release instead of v0.9.0rc1. + +### Highlights + +- Fix the import error when vllm-ascend is installed without editable way. [#1152](https://github.com/vllm-project/vllm-ascend/pull/1152) + ## v0.9.0rc1 - 2025.06.09 This is the 1st release candidate of v0.9.0 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. From this release, V1 Engine is recommended to use. The code of V0 Engine is frozen and will not be maintained any more. Please set environment `VLLM_USE_V1=1` to enable V1 Engine. 
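To make the release note above concrete, here is a minimal sketch of enabling the V1 Engine before building a model, mirroring the graph mode examples in this change; the model name is only a placeholder.

```python
import os

from vllm import LLM

# Sketch only: enable the V1 Engine via the environment variable recommended
# above, then run a simple offline generation. The model name is a placeholder.
os.environ["VLLM_USE_V1"] = "1"

model = LLM(model="Qwen/Qwen2-7B-Instruct")
outputs = model.generate("Hello, how are you?")
print(outputs)
```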
diff --git a/examples/disaggregated_prefill/find_device_ips.py b/examples/disaggregated_prefill/find_device_ips.py index 205afbf78..48fd7b9d3 100644 --- a/examples/disaggregated_prefill/find_device_ips.py +++ b/examples/disaggregated_prefill/find_device_ips.py @@ -30,38 +30,40 @@ HCCN_TOOL_PATH = envs.HCCN_PATH -def get_device_ips(world_size: int): - npu_info = subprocess.run( - ["npu-smi", "info", "-m"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - ) +def get_device_ips(): + npu_info = subprocess.run(['npu-smi', 'info', '-m'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) if npu_info.returncode != 0 or not os.path.exists(HCCN_TOOL_PATH): raise RuntimeError("No npu-smi/hccn_tool tools provided for NPU.") - npu_start_idx = int( - re.match(r".*\n\t([0-9]+).*", - npu_info.stdout).group(1)) # type: ignore + + # ‌Extract NPU IDs for all Ascend devices (excluding Mcu rows) + device_ids = [] + for line in npu_info.stdout.strip().split('\n'): + match = re.match(r'^\s*(\d+)\s+\d+\s+\d+\s+Ascend', line) + if match: + device_ids.append(int(match.group(1))) + + if not device_ids: + raise RuntimeError( + "Cannot parse any valid device ID from npu-smi output.") + device_ip_list = [] - for ip_offset in range(world_size): - cmd = [ - HCCN_TOOL_PATH, - "-i", - f"{npu_start_idx + ip_offset}", - "-ip", - "-g", - ] - device_ip_info = subprocess.run( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - ) - device_ip = re.match(r"ipaddr:(.*)\n", - device_ip_info.stdout).group(1) # type: ignore + for device_id in device_ids: + cmd = [HCCN_TOOL_PATH, '-i', str(device_id), '-ip', '-g'] + device_ip_info = subprocess.run(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) + ip_match = re.search(r'ipaddr:(.*)', device_ip_info.stdout) + if not ip_match: + raise RuntimeError( + f"Cannot parse IP from hccn_tool for device {device_id}") + device_ip = ip_match.group(1).strip() device_ip_list.append(device_ip) + return device_ip_list -# Pass number of NPUs into this function. -print(get_device_ips(8)) +print(get_device_ips()) diff --git a/examples/offline_multi_step_custom_ops.py b/examples/offline_multi_step_custom_ops.py index 82a1bf575..59c7fafcc 100644 --- a/examples/offline_multi_step_custom_ops.py +++ b/examples/offline_multi_step_custom_ops.py @@ -19,9 +19,6 @@ from vllm import LLM, SamplingParams -import vllm_ascend.platform as pf - -pf.CUSTOM_OP_ENABLED = True # set True for custom Ops of Multi-Step. 
prompts = [ "Hello, my name is", "The president of the United States is", diff --git a/examples/run_dp_attention_etp16.sh b/examples/run_dp_attention_etp16.sh new file mode 100644 index 000000000..b73649241 --- /dev/null +++ b/examples/run_dp_attention_etp16.sh @@ -0,0 +1,23 @@ +export VLLM_ENABLE_MC2=0 +export VLLM_USE_V1=1 +export TASK_QUEUE_ENABLE=1 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +source /usr/local/Ascend/nnal/atb/set_env.sh +export ASCEND_LAUNCH_BLOCKING=0 +export VLLM_VERSION=0.9.0 + +nohup python -m vllm.entrypoints.openai.api_server --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM \ + --quantization ascend \ + --trust-remote-code \ + --distributed-executor-backend=mp \ + --port 8006 \ + -tp=8 \ + -dp=2 \ + --max-num-seqs 24 \ + --max-model-len 32768 \ + --max-num-batched-tokens 32768 \ + --block-size 128 \ + --no-enable-prefix-caching \ + --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true},"expert_tensor_parallel_size":16}' \ + --gpu-memory-utilization 0.96 &> run.log & +disown \ No newline at end of file diff --git a/examples/run_dp_attention_etp16_benmark.sh b/examples/run_dp_attention_etp16_benmark.sh new file mode 100644 index 000000000..ab72b3b28 --- /dev/null +++ b/examples/run_dp_attention_etp16_benmark.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Concurrency array +concurrency_array=(48) +#best rate +rate_array=(0.7) + +# Result file +result_file="benchmark_results.txt" +echo "Benchmark Results" > $result_file +echo "===================" >> $result_file + +# Loop through all combinations +for concurrency in "${concurrency_array[@]}"; do + for rate in "${rate_array[@]}"; do + echo "Testing with concurrency=$concurrency, rate=$rate" + echo "" >> $result_file + echo "Concurrency: $concurrency, Request Rate: $rate" >> $result_file + echo "-------------------" >> $result_file + + # Run benchmark test + python /mnt/deepseek/vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --trust-remote-code \ + --model /mnt/deepseek/DeepSeek-R1-W8A8-VLLM \ + --dataset-name random \ + --random-input-len 4096 \ + --random-output-len 1536 \ + --ignore-eos \ + --num-prompts 400 \ + --max-concurrency $concurrency \ + --request-rate $rate \ + --metric-percentiles 90 \ + --base-url http://localhost:8006 2>&1 | tee -a $result_file + + # Wait for system cool down + sleep 30 + done +done + +# Analyze results +echo "Analysis Results" > analysis_results.txt +echo "=================" >> analysis_results.txt + +# Extract and analyze TPOT data +echo "TPOT Analysis:" >> analysis_results.txt +grep "Mean TPOT" $result_file | awk -F':' '{ + printf "Concurrency %s, Rate %s: %s ms\n", $1, $2, $3 +}' >> analysis_results.txt + +# Extract and analyze throughput data +echo -e "\nThroughput Analysis:" >> analysis_results.txt +grep "Output token throughput" $result_file | awk -F':' '{ + printf "Concurrency %s, Rate %s: %s tokens/s\n", $1, $2, $3 +}' >> analysis_results.txt + +echo "Testing completed. 
Results saved in $result_file and analysis in analysis_results.txt" diff --git a/mypy.ini b/mypy.ini index 72b03de21..6fe8e6c29 100644 --- a/mypy.ini +++ b/mypy.ini @@ -6,6 +6,9 @@ warn_unused_configs = True [mypy-torch_npu.*] ignore_missing_imports = True +[mypy-torchair.*] +ignore_missing_imports = True + [mypy-transformers.*] ignore_missing_imports = True diff --git a/tests/conftest.py b/tests/conftest.py index 16bbc8027..e0d70a19d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,7 @@ # Adapted from vllm-project/vllm/blob/main/tests/conftest.py # +import contextlib import gc from typing import List, Optional, Tuple, TypeVar, Union @@ -53,11 +54,17 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray] -def cleanup_dist_env_and_memory(): +def cleanup_dist_env_and_memory(shutdown_ray: bool = False): destroy_model_parallel() destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + if shutdown_ray: + import ray # Lazy import Ray + ray.shutdown() gc.collect() torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() class VllmRunner: diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py index dc02c4b97..f5ec2c872 100644 --- a/tests/multicard/test_offline_inference_distributed.py +++ b/tests/multicard/test_offline_inference_distributed.py @@ -23,7 +23,7 @@ import os from unittest.mock import patch -import vllm # noqa: F401 +from modelscope import snapshot_download # type: ignore from vllm import SamplingParams from tests.conftest import VllmRunner @@ -95,3 +95,20 @@ def test_models_distributed_DeepSeek_dbo(): distributed_executor_backend="mp", ) as vllm_model: vllm_model.generate(example_prompts, sampling_params) + + +def test_models_distributed_DeepSeek_W8A8(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + + with VllmRunner( + snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"), + max_model_len=8192, + enforce_eager=True, + dtype="auto", + tensor_parallel_size=4, + quantization="ascend", + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/multicard/test_torchair_graph_mode.py b/tests/multicard/test_torchair_graph_mode.py new file mode 100644 index 000000000..d06ec7de2 --- /dev/null +++ b/tests/multicard/test_torchair_graph_mode.py @@ -0,0 +1,80 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +"""Compare the short outputs of HF and vLLM when using greedy sampling. + +Run `pytest tests/multicard/test_torchair_graph_mode.py`. 
+""" +import os + +import pytest + +from tests.conftest import VllmRunner + +os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" + + +@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", + reason="torchair graph is not supported on v0") +def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + + example_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + dtype = "half" + max_tokens = 5 + # torchair is only work without chunked-prefill now + with VllmRunner( + "vllm-ascend/DeepSeek-V3-Pruning", + dtype=dtype, + tensor_parallel_size=4, + distributed_executor_backend="mp", + additional_config={ + "torchair_graph_config": { + "enabled": True, + }, + "ascend_scheduler_config": { + "enabled": True, + }, + "refresh": True, + }, + enforce_eager=False, + ) as vllm_model: + # use greedy sampler to make sure the generated results are fix + vllm_output = vllm_model.generate_greedy(example_prompts, + max_tokens) + # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random weight of + # DeepSeek-V3 with 2 hidden layers, thus the golden results seems + # inaccurate. This will only change if accuracy improves with the + # official weights of DeepSeek-V3. + golden_results = [ + 'Hello, my name is feasibility伸 spazio debtor添', + 'The president of the United States is begg"""\n杭州风和 bestimm', + 'The capital of France is frequentlyশามalinkAllowed', + 'The future of AI is deleting俯احت怎么样了حراف', + ] + + assert len(golden_results) == len(vllm_output) + for i in range(len(vllm_output)): + assert golden_results[i] == vllm_output[i][1] + print(f"Generated text: {vllm_output[i][1]!r}") diff --git a/tests/ops/test_vocabparallelembedding.py b/tests/ops/test_vocabparallelembedding.py new file mode 100644 index 000000000..97d6c7059 --- /dev/null +++ b/tests/ops/test_vocabparallelembedding.py @@ -0,0 +1,91 @@ +from typing import Tuple + +import pytest +import torch +import torch_npu # noqa: F401 + +import vllm_ascend.platform # noqa: F401 + +# Test parameters +DTYPES = [torch.int32] +#SHAPES = [(100,), (5, 20), (3, 4, 5)] # Various tensor shapes +#SHAPES = [(3, 4, 8), (3, 4, 5)] # Various tensor shapes +SHAPES = [(3, 4, 3)] +DEVICES = [f"npu:{0}"] +SEEDS = [0] + + +def get_masked_input_and_mask_ref( + input_: torch.Tensor, org_vocab_start_index: int, + org_vocab_end_index: int, num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Reference implementation for verification""" + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < + org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index) + added_offset = added_vocab_start_index - ( + org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding + valid_offset = (org_vocab_start_index * + org_vocab_mask) + (added_offset * added_vocab_mask) + vocab_mask = org_vocab_mask | added_vocab_mask + masked_input = vocab_mask * (input_ - valid_offset) + return masked_input, ~vocab_mask + + +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEEDS) +@torch.inference_mode() +def test_get_masked_input_and_mask( + shape: Tuple[int, ...], + dtype: torch.dtype, + device: str, + seed: int, +) -> 
None: + # Set random seed + torch.manual_seed(seed) + torch.set_default_device(device) + + # Generate random input tensor + input_tensor = torch.randint(0, 1000, shape, dtype=dtype) + + # Test parameters + test_case = { + "org_start": 100, + "org_end": 200, + "padding": 0, + "added_start": 300, + "added_end": 400, + } + + # Get reference result + ref_masked_input, ref_mask = get_masked_input_and_mask_ref( + input_tensor, test_case["org_start"], test_case["org_end"], + test_case["padding"], test_case["added_start"], test_case["added_end"]) + + # Get custom op result + print("input_tensor:", input_tensor) + custom_masked_input, custom_mask = torch.ops._C.get_masked_input_and_mask( + input_tensor, test_case["org_start"], test_case["org_end"], + test_case["padding"], test_case["added_start"], test_case["added_end"]) + + ref_masked_input = ref_masked_input.to(dtype) + print("custom_masked_input:", custom_masked_input) + print("ref_masked_input:", ref_masked_input) + print("custom_mask:", custom_mask) + print("ref_mask:", ref_mask) + # Compare results + torch.testing.assert_close( + custom_masked_input, + ref_masked_input, + rtol=1e-5, + atol=1e-5, + msg=f"Masked input mismatch for case: {test_case}") + torch.testing.assert_close(custom_mask, + ref_mask, + rtol=1e-5, + atol=1e-5, + msg=f"Mask mismatch for case: {test_case}") diff --git a/tests/singlecard/compile/test_simple.py b/tests/singlecard/compile/test_simple.py index 64d4cba67..70b89297a 100644 --- a/tests/singlecard/compile/test_simple.py +++ b/tests/singlecard/compile/test_simple.py @@ -14,8 +14,6 @@ set_current_vllm_config) from vllm.utils import direct_register_custom_op -from vllm_ascend.utils import vllm_version_is - global_counter = 0 # create a library to hold the custom op @@ -93,28 +91,14 @@ def test_simple_piecewise_compile(): model = SillyModel(vllm_config=vllm_config, prefix="") inputs = torch.randn(100).npu() - - if vllm_version_is("0.9.0"): - kwargs = { - "num_graphs_seen": 1, # one graph for the model - "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 - "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers - "num_backend_compilations": - 3, # num_piecewise_capturable_graphs_seen - "num_cudagraph_caputured": - 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - } - else: - kwargs = { - "num_graphs_seen": 1, # one graph for the model - "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 - "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers - "num_backend_compilations": - 3, # num_piecewise_capturable_graphs_seen - "num_cudagraph_captured": - 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - } - + kwargs = { + "num_graphs_seen": 1, # one graph for the model + "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 + "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers + "num_backend_compilations": 3, # num_piecewise_capturable_graphs_seen + "num_cudagraph_captured": + 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + } with compilation_counter.expect(kwargs): model(inputs) diff --git a/tests/singlecard/core/__init__.py b/tests/singlecard/core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/singlecard/core/test_ascend_scheduler.py b/tests/singlecard/core/test_ascend_scheduler.py new file mode 100644 index 000000000..c382ebdf4 --- /dev/null +++ b/tests/singlecard/core/test_ascend_scheduler.py @@ -0,0 +1,792 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing 
import Optional + +import pytest +import torch +from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, + SchedulerConfig, SpeculativeConfig, VllmConfig) +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.sampling_params import SamplingParams +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec) +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.request import Request, RequestStatus +from vllm.v1.structured_output import StructuredOutputManager + +from vllm_ascend.core.scheduler import AscendScheduler +from vllm_ascend.utils import vllm_version_is + +EOS_TOKEN_ID = 50256 + + +def create_scheduler( + model: str = "Qwen/Qwen2.5-0.5B-Instruct", + max_num_seqs: int = 16, + max_num_batched_tokens: int = 8192, + enable_prefix_caching: Optional[bool] = None, + long_prefill_token_threshold: int = 0, + disable_chunked_mm_input: bool = False, + use_kv_connector: bool = False, + num_blocks: int = 10000, + block_size: int = 16, + max_model_len: Optional[int] = None, + num_speculative_tokens: Optional[int] = None, + enable_chunked_prefill: bool = False, +) -> AscendScheduler: + '''Create scheduler under test. + + Args: + model: model under test + max_num_seqs: max sequences to schedule + max_num_batch_tokens: max num tokens to batch + enable_prefix_caching: optionally force APC config + (True/False) or use default + (None) + + Returns: + {class}`Scheduler` instance + ''' + if max_model_len is None: + max_model_len = max_num_batched_tokens + scheduler_config = SchedulerConfig( + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + max_model_len=max_model_len, + long_prefill_token_threshold=long_prefill_token_threshold, + disable_chunked_mm_input=disable_chunked_mm_input, + enable_chunked_prefill=enable_chunked_prefill, + ) + model_config = ModelConfig( + model=model, + task="auto", + tokenizer=model, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="float16", + seed=42, + ) + # Cache config, optionally force APC + kwargs_cache = ({} if enable_prefix_caching is None else { + 'enable_prefix_caching': enable_prefix_caching + }) + cache_config = CacheConfig( + block_size=block_size, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + **kwargs_cache, + ) + kv_transfer_config = KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": "local_storage"}, + ) if use_kv_connector else None + + speculative_config: Optional[SpeculativeConfig] = None + if num_speculative_tokens is not None: + speculative_config = SpeculativeConfig( + model="ngram", num_speculative_tokens=num_speculative_tokens) + + vllm_config = VllmConfig( + scheduler_config=scheduler_config, + model_config=model_config, + cache_config=cache_config, + kv_transfer_config=kv_transfer_config, + speculative_config=speculative_config, + ) + kv_cache_config = KVCacheConfig( + num_blocks=num_blocks, # A large number of blocks to hold all requests + **({ + "tensors": {} + } if vllm_version_is("0.9.0") else { + "kv_cache_tensors": [] + }), + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(block_size, 1, 1, torch.float32, + False)) + ], + ) + cache_config.num_gpu_blocks = num_blocks + return AscendScheduler( + vllm_config=vllm_config, + kv_cache_config=kv_cache_config, + log_stats=True, + structured_output_manager=StructuredOutputManager(vllm_config), + ) + + +def 
create_requests(num_requests: int, + num_tokens: int = 10, + mm_positions: Optional[list[PlaceholderRange]] = None, + max_tokens: int = 16, + stop_token_ids: Optional[list[int]] = None, + prompt_logprobs: Optional[int] = None): + sampling_params = SamplingParams(ignore_eos=False, + max_tokens=max_tokens, + stop_token_ids=stop_token_ids, + prompt_logprobs=prompt_logprobs) + requests = [] + for i in range(num_requests): + if mm_positions is not None: + mm_position = mm_positions[i] + mm_inputs = [MultiModalKwargs({})] * len(mm_position) + else: + mm_position = None + mm_inputs = None + request = Request( + request_id=f"{i}", + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_inputs=mm_inputs, + multi_modal_placeholders=mm_position, + multi_modal_hashes=None, + eos_token_id=EOS_TOKEN_ID, + **({ + "arrival_time": 0.0 + } if vllm_version_is("0.9.0") else {}), + ) + requests.append(request) + return requests + + +def test_add_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + + for i, request in enumerate(requests): + scheduler.add_request(request) + assert request.request_id in scheduler.requests + assert len(scheduler.waiting) == i + 1 + + +def test_finish_request(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_ABORTED) + assert request.request_id not in scheduler.requests + assert len(scheduler.waiting) == 9 - i + + +def test_get_num_unfinished_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_STOPPED) + assert scheduler.get_num_unfinished_requests() == len(requests) - i - 1 + + +@pytest.mark.parametrize("enable_prefix_caching, prompt_logprobs", [ + (None, None), + (True, 5), +]) +def test_schedule(enable_prefix_caching: Optional[bool], + prompt_logprobs: Optional[int]): + '''Test scheduling. + Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs + ''' + scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching) + requests = create_requests(num_requests=10, + prompt_logprobs=prompt_logprobs) + for request in requests: + scheduler.add_request(request) + + # Test initial scheduling + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + # Verify all requests are scheduled. + for req_id, num_tokens in output.num_scheduled_tokens.items(): + assert num_tokens == len(requests[int(req_id)].prompt_token_ids) + + # Verify requests moved from waiting to running + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == len(requests) + for i, request in enumerate(requests): + assert scheduler.running[i] == request + + +@pytest.mark.parametrize("enable_prefix_caching", [True, False]) +def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): + """Test scheduling behavior with concurrent partial requests. + + This test verifies that: there are multiple long prefill requests in the + RUNNING state, and we can schedule them together. 
+ + """ + scheduler = create_scheduler( + model="facebook/opt-125m", + max_num_batched_tokens=1024, + long_prefill_token_threshold=400, + enable_prefix_caching=enable_prefix_caching, + enable_chunked_prefill=True, + ) + requests = create_requests( + num_requests=3, + num_tokens=800, + ) + for request in requests: + scheduler.add_request(request) + + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 3 + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + + # The first request is scheduled partially - 400. + assert output.num_scheduled_tokens[requests[0].request_id] == 400 + # The second request is scheduled partially - 400. + assert output.num_scheduled_tokens[requests[1].request_id] == 400 + # The third request is also scheduled partially - 1024 - 400 - 400 = 224. + assert output.num_scheduled_tokens[requests[2].request_id] == 224 + req_to_index = { + request.request_id: i + for i, request in enumerate(requests) + } + model_runner_output = ModelRunnerOutput( + req_ids=[request.request_id for request in requests], + req_id_to_index=req_to_index, + sampled_token_ids=[[] for _ in range(len(requests))], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(output, model_runner_output) + + # Schedule the next step. All three requests are running. + # Processed the remaining prefills of the first and second requests. + output1 = scheduler.schedule() + assert len(scheduler.running) == 3 + assert len(output1.scheduled_new_reqs) == 0 + assert len(output1.scheduled_cached_reqs) == 3 + assert len(output1.finished_req_ids) == 0 + assert output1.num_scheduled_tokens[requests[0].request_id] == 400 + assert output1.num_scheduled_tokens[requests[1].request_id] == 400 + assert output1.num_scheduled_tokens[requests[2].request_id] == 224 + + # Schedule the third step. All three requests are running. + # First and second requests are in the decode stage. + # All the remaining tokens in the third request are processed. 
+ model_runner_output = ModelRunnerOutput( + req_ids=[request.request_id for request in requests], + req_id_to_index=req_to_index, + sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(output1, model_runner_output) + output2 = scheduler.schedule() + assert len(scheduler.running) == 3 + assert len(output2.scheduled_new_reqs) == 0 + assert len(output2.scheduled_cached_reqs) == 3 + assert len(output2.finished_req_ids) == 0 + assert output2.num_scheduled_tokens[requests[0].request_id] == 1 + assert output2.num_scheduled_tokens[requests[1].request_id] == 1 + assert output2.num_scheduled_tokens[ + requests[2].request_id] == 800 - 224 - 224 + + +def test_stop_via_update_from_output(): + """Test stopping behavior through update_from_output""" + scheduler = create_scheduler(num_speculative_tokens=1) + + # Test case 1: Stop on EOS token + requests = create_requests(num_requests=2, max_tokens=10) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 1, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [], + requests[1].request_id: [10] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[EOS_TOKEN_ID], + [10, + 11]], # First request hits EOS, second continues + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped, second continues + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_STOPPED + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID] + assert list(requests[1].output_token_ids) == [10, 11] + + # Test case 2: Stop on custom stop token + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=2, + max_tokens=10, + stop_token_ids=[42, 43]) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=5, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 42], + requests[1].request_id: [13] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + spec_token_ids=None, + logprobs=None, + 
prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped on custom token + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_STOPPED + assert requests[0].stop_reason == 42 + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [10, 42] + assert list(requests[1].output_token_ids) == [13, 14] + + # Test case 3: Stop on max tokens + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=2, max_tokens=2) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 1 + }, + total_num_scheduled_tokens=4, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 11], + requests[1].request_id: [] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped due to length + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_LENGTH_CAPPED + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [10, 11 + ] # Truncated to max_tokens + assert list(requests[1].output_token_ids) == [13] + + # Test case 4: Ignore EOS flag + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=1, max_tokens=10) + requests[0].sampling_params.ignore_eos = True + requests[0].num_computed_tokens = requests[0].num_tokens + scheduler.requests[requests[0].request_id] = requests[0] + scheduler.running.append(requests[0]) + + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={requests[0].request_id: 3}, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [EOS_TOKEN_ID, 10] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify request continues past EOS + assert len(scheduler.running) == 1 + assert not requests[0].is_finished() + assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID, 10, 11] + + +@pytest.mark.parametrize("enable_prefix_caching, prompt_logprobs", [ + (None, None), + (True, 5), +]) +def test_schedule_concurrent_batches(enable_prefix_caching: 
Optional[bool], + prompt_logprobs: Optional[int]): + scheduler = create_scheduler( + max_num_batched_tokens=1024, + max_num_seqs=2, + enable_prefix_caching=enable_prefix_caching, + enable_chunked_prefill=True, + ) + requests = create_requests( + num_requests=2, + num_tokens=512, + prompt_logprobs=prompt_logprobs, + ) + + # Schedule the first request. + scheduler.add_request(requests[0]) + scheduler_output0 = scheduler.schedule() + assert len(scheduler_output0.scheduled_new_reqs) == 1 + assert scheduler_output0.num_scheduled_tokens[ + requests[0].request_id] == 512 + + # The first request is still running, so only schedule the second request. + scheduler.add_request(requests[1]) + scheduler_output1 = scheduler.schedule() + assert len(scheduler_output1.scheduled_new_reqs) == 1 + assert scheduler_output1.num_scheduled_tokens[ + requests[1].request_id] == 512 + + # Model output of the first request. + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(scheduler_output0, model_runner_output) + + # Schedule the next step. + # The first request can be scheduled again while the second + # request is still running. + scheduler_output2 = scheduler.schedule() + assert scheduler_output2.num_scheduled_tokens[requests[0].request_id] == 1 + + # Model output of the second request. + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(scheduler_output1, model_runner_output) + + +# Note - these test cases mirror some of those in test_rejection_sampler.py +@pytest.mark.parametrize( + "spec_tokens,output_tokens,expected", + [ + ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match + ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch + ([[1, 2], [3]], [[1, 2, 5], [3, 4]], + (2, 3, 3, [2, 1])), # multiple sequences + ([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence + ([[]], [[5]], (0, 0, 0, [0])), # empty sequence + ([[1, 2, 3], [4, 5, 6]], [[1, 2, 7], [4, 8]], + (2, 6, 3, [2, 1, 0])), # multiple mismatches + ]) +def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): + """Test scheduling behavior with speculative decoding. + + This test verifies that: + 1. Speculated tokens get scheduled correctly + 2. 
Spec decoding stats properly count number of draft and accepted tokens + """ + if vllm_version_is("0.9.0"): + return + num_spec_tokens = max(1, max(len(t) for t in spec_tokens)) + scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens) + requests = create_requests(num_requests=len(spec_tokens), num_tokens=1) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i + + # Schedule a decode, which will also draft speculative tokens + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert output.total_num_scheduled_tokens == len(requests) + for i in range(len(requests)): + req_id = requests[i].request_id + assert output.num_scheduled_tokens[req_id] == 1 + assert req_id not in output.scheduled_spec_decode_tokens + + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + spec_token_ids=spec_tokens, + logprobs=None, + prompt_logprobs_dict={}, + ) + engine_core_outputs = scheduler.update_from_output(output, + model_runner_output) + + for i in range(len(requests)): + running_req = scheduler.running[i] + # The prompt token + assert running_req.num_computed_tokens == 1 + # The prompt token and the sampled token + assert running_req.num_tokens == 2 + # The prompt token, the sampled token, and the speculated tokens + assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i]) + + # No draft or accepted tokens counted yet + assert not engine_core_outputs or ( + engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None) + + # Schedule the speculated tokens for validation + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 0 + # The sampled token and speculated tokens + assert output.total_num_scheduled_tokens == \ + len(requests) + sum(len(ids) for ids in spec_tokens) + for i in range(len(requests)): + req_id = requests[i].request_id + assert output.num_scheduled_tokens[req_id] == 1 + len(spec_tokens[i]) + if spec_tokens[i]: + assert len(output.scheduled_spec_decode_tokens[req_id]) == \ + len(spec_tokens[i]) + else: + assert req_id not in output.scheduled_spec_decode_tokens + + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + engine_core_outputs = scheduler.update_from_output(output, + model_runner_output) + + scheduler_stats = engine_core_outputs[0].scheduler_stats \ + if engine_core_outputs else None + if expected[0] == 0: + assert scheduler_stats.spec_decoding_stats is None # type: ignore + else: + assert scheduler_stats.spec_decoding_stats is not None # type: ignore + stats = scheduler_stats.spec_decoding_stats # type: ignore + assert stats.num_drafts == expected[0] + assert stats.num_draft_tokens == expected[1] + assert stats.num_accepted_tokens == expected[2] + assert stats.num_accepted_tokens_per_pos == expected[3] + + +def _assert_right_scheduler_output( + output: SchedulerOutput, + num_requests: int, + expected_num_scheduled_tokens: int, +): + """Check if SchedulerOutput is correct after remote KV cache hit.""" + + # We should inject the kv_connector_metadata. + assert len(output.kv_connector_metadata.requests) == num_requests + + # Only num_tokens - matched_num_new_tokens should be scheduled. 
+ for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + assert num_scheduled_tokens == expected_num_scheduled_tokens + + +def _assert_right_kv_cache_manager( + scheduler: AscendScheduler, + req_ids: list[str], + num_tokens: int, + block_size: int, + num_requests: int, + num_total_blocks: int, +): + """Check whether KVCacheManager is correct after allocate.""" + + # Make sure the request stats are right. + EXPECTED_TOTAL_BLOCKS = num_tokens // block_size + for req_id in req_ids: + blocks = (scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks[req_id]) + hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id] + assert (scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + num_cached_block[req_id] == EXPECTED_TOTAL_BLOCKS) + assert len(blocks) == EXPECTED_TOTAL_BLOCKS + assert len(hashes) == EXPECTED_TOTAL_BLOCKS + + # Make sure we actually touched all the blocks. + BLOCKS_PER_REQ = num_tokens / block_size + assert (scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == + num_total_blocks - num_requests * BLOCKS_PER_REQ) + + +def _step_until_done( + scheduler: AscendScheduler, + output: SchedulerOutput, + model_runner_output: ModelRunnerOutput, +): + """Loop over schedule(), update_from_output() until finished.""" + + all_finished = False + _ = scheduler.update_from_output(output, model_runner_output) + while not all_finished: + # Schedule + a few iterations until stopping. + output = scheduler.schedule() + assert len(scheduler.running) + for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + # We should be in the decode phase now. + assert num_scheduled_tokens == 1 + assert len(output.kv_connector_metadata.requests) == 0 + ecos = scheduler.update_from_output(output, model_runner_output)[0] + all_done = True + for eco in ecos.outputs: + if eco.finish_reason is None: + all_done = False + all_finished = all_done + + +def make_output(scheduler: AscendScheduler): + return ModelRunnerOutput( + req_ids=[req.request_id for req in scheduler.running], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(scheduler.running) + }, + sampled_token_ids=[[1000]] * len(scheduler.running), + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + + +def assert_scheduler_empty(scheduler: AscendScheduler): + """Confirm the scheduler is "empty" - i.e. no leaks.""" + # Scheduler Metadata. + assert len(scheduler.requests) == 0 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 0 + assert len(scheduler.finished_req_ids) == 0 + assert len(scheduler._cached_reqs_data) == 0 + + # EncoderCacheManager. + assert len(scheduler.encoder_cache_manager.freed) == 0 + assert len(scheduler.encoder_cache_manager.cached) == 0 + + # KVCache Manager. + if not vllm_version_is("0.9.0"): + assert len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks) == 0 + assert len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].num_cached_block) == 0 + assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 + num_free_blocks = ( + scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) + assert num_free_blocks == ( + scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1) + + # NOTE(rob): just the ref count on blocks will be 0. The hash + # value, etc will remain since we lazily evict for prefix cache. 
+ for block in scheduler.kv_cache_manager.block_pool.blocks: + assert block.ref_cnt == 0 + + +def test_memory_leak(): + """Test that we do not have a memory leak.""" + + scheduler = create_scheduler(enable_prefix_caching=True) + + NUM_REQUESTS = 5 + NUM_TOKENS = 10 + MAX_TOKENS = 10 + requests = create_requests(num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + max_tokens=MAX_TOKENS) + + # Add each request. + for request in requests: + scheduler.add_request(request) + scheduler_output = scheduler.schedule() + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Iterate until done. + while True: + scheduler_output = scheduler.schedule() + if len(scheduler.running) == 0: + break + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Confirm no memory leak. + assert_scheduler_empty(scheduler) \ No newline at end of file diff --git a/tests/singlecard/core/test_ascend_scheduler_e2e.py b/tests/singlecard/core/test_ascend_scheduler_e2e.py new file mode 100644 index 000000000..668dafced --- /dev/null +++ b/tests/singlecard/core/test_ascend_scheduler_e2e.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + +import pytest +from vllm import LLM + +if os.getenv("VLLM_USE_V1", "0") != "1": + pytest.skip("Test package requires V1", allow_module_level=True) + +MODEL = "Qwen/Qwen2.5-0.5B-Instruct" +PROMPT = "Hello my name is Robert and I" + + +@pytest.fixture(scope="module") +def model() -> LLM: + return LLM( + MODEL, + enforce_eager=True, + enable_prefix_caching=True, + max_num_batched_tokens=200, + max_num_seqs=3, + additional_config={"ascend_scheduler_config": { + "enabled": True, + }}) + + +def test_concurrent_partial_prefill(model): + outputs = model.generate([PROMPT] * 3) + assert len(outputs) == 3 + for output in outputs: + assert len(output.outputs) == 1 + + +def test_prefix_cache_stats_is_recorded(model): + # 129 tokens will make sure the first 128 tokens are cached in a full block + input_tokens = {"prompt_token_ids": [101] * 129} + _ = model.generate([input_tokens]) + outputs = model.generate([input_tokens]) + assert outputs[0].num_cached_tokens == 128 \ No newline at end of file diff --git a/tests/singlecard/ops/test_rotary_embedding.py b/tests/singlecard/ops/test_rotary_embedding.py index 2d5ec18da..a3504a88b 100644 --- a/tests/singlecard/ops/test_rotary_embedding.py +++ b/tests/singlecard/ops/test_rotary_embedding.py @@ -10,7 +10,9 @@ import torch import torch.nn as nn -import vllm_ascend.platform # noqa: F401 +from vllm_ascend.utils import enable_custom_op + +enable_custom_op() # Only Neox style true scenario is supported for now IS_NEOX_STYLE = [True] diff --git a/tests/singlecard/test_ascend_config.py b/tests/singlecard/test_ascend_config.py index 484fe5f70..63484d4a0 100644 --- a/tests/singlecard/test_ascend_config.py +++ b/tests/singlecard/test_ascend_config.py @@ -58,7 +58,8 @@ def test_run_with_ascend_config(): "use_cached_graph": True, "graph_batch_sizes": [1, 2, 4, 8], "graph_batch_sizes_init": False, - "enable_multistream_shared_expert": True, + "enable_multistream_moe": True, + "enable_multistream_mla": True, }, "ascend_scheduler_config": { "enabled": True, @@ -79,7 +80,8 @@ def test_run_with_ascend_config(): 1, 2, 4, 8 ] assert not ascend_config.torchair_graph_config.graph_batch_sizes_init - assert ascend_config.torchair_graph_config.enable_multistream_shared_expert 
+ assert ascend_config.torchair_graph_config.enable_multistream_mla + assert ascend_config.torchair_graph_config.enable_multistream_moe assert ascend_config.ascend_scheduler_config.enabled assert ascend_config.ascend_scheduler_config.enable_chunked_prefill assert ascend_config.expert_tensor_parallel_size == 1 diff --git a/tests/singlecard/test_offline_inference.py b/tests/singlecard/test_offline_inference.py index 553d109f3..572649f9a 100644 --- a/tests/singlecard/test_offline_inference.py +++ b/tests/singlecard/test_offline_inference.py @@ -37,6 +37,10 @@ "Qwen/Qwen3-0.6B-Base", ] MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"] + +QUANTIZATION_MODELS = [ + "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8", +] os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" QUANTIZATION_MODELS = [ diff --git a/tests/singlecard/test_scheduler.py b/tests/singlecard/test_scheduler.py index d1c606278..8021f0306 100644 --- a/tests/singlecard/test_scheduler.py +++ b/tests/singlecard/test_scheduler.py @@ -31,7 +31,6 @@ from vllm.v1.structured_output import StructuredOutputManager from vllm_ascend.core.scheduler import AscendScheduler -from vllm_ascend.utils import vllm_version_is EOS_TOKEN_ID = 50256 @@ -87,27 +86,15 @@ def create_scheduler( vllm_config = VllmConfig(scheduler_config=scheduler_config, model_config=model_config, cache_config=cache_config) - - if vllm_version_is("0.9.0"): - kv_cache_config = KVCacheConfig( - num_blocks=10000, # A large number of blocks to hold all requests - tensors={}, - kv_cache_groups=[ - KVCacheGroupSpec(['layer'], - FullAttentionSpec(16, 1, 1, torch.float32, - False)) - ], - ) - else: - kv_cache_config = KVCacheConfig( - num_blocks=10000, # A large number of blocks to hold all requests - kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])], - kv_cache_groups=[ - KVCacheGroupSpec(['layer'], - FullAttentionSpec(16, 1, 1, torch.float32, - False, None)) - ], - ) + kv_cache_config = KVCacheConfig( + num_blocks=10000, # A large number of blocks to hold all requests + kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])], + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(16, 1, 1, torch.float32, False, + None)) + ], + ) cache_config.num_gpu_blocks = 10000 return AscendScheduler( vllm_config, @@ -135,27 +122,15 @@ def create_requests(num_requests: int, else: mm_position = None mm_inputs = None - if vllm_version_is("0.9.0"): - request = Request( - request_id=f"{i}", - prompt_token_ids=[i] * num_tokens, - sampling_params=sampling_params, - multi_modal_inputs=mm_inputs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=None, - arrival_time=0, - eos_token_id=EOS_TOKEN_ID, - ) - else: - request = Request( - request_id=f"{i}", - prompt_token_ids=[i] * num_tokens, - sampling_params=sampling_params, - multi_modal_inputs=mm_inputs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=None, - eos_token_id=EOS_TOKEN_ID, - ) + request = Request( + request_id=f"{i}", + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_inputs=mm_inputs, + multi_modal_placeholders=mm_position, + multi_modal_hashes=None, + eos_token_id=EOS_TOKEN_ID, + ) requests.append(request) return requests diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index d42f145e4..defa7fd3d 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -56,10 +56,13 @@ def __init__(self, torchair_graph_config): "graph_batch_sizes", []) self.graph_batch_sizes_init = torchair_graph_config.get( 
"graph_batch_sizes_init", False) - self.enable_multistream_shared_expert = torchair_graph_config.get( - "enable_multistream_shared_expert", False) + self.enable_multistream_mla = torchair_graph_config.get( + "enable_multistream_mla", False) + self.enable_multistream_moe = torchair_graph_config.get( + "enable_multistream_moe", False) self.enable_view_optimize = torchair_graph_config.get( "enable_view_optimize", True) + self.enable_kv_nz = torchair_graph_config.get("enable_kv_nz", False) if not isinstance(self.graph_batch_sizes, list): raise TypeError("graph_batch_sizes must be list[int]") diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py index 8f130e424..9c112ed62 100644 --- a/vllm_ascend/attention/attention.py +++ b/vllm_ascend/attention/attention.py @@ -36,10 +36,12 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ops.cache import concat_and_cache_mla -from vllm_ascend.platform import CUSTOM_OP_ENABLED +from vllm_ascend.utils import enable_custom_op from vllm_ascend.worker.model_runner import ( ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata) +_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128] + def generate_attn_mask(max_seq_len: int, dtype=torch.float16, mask_value=None): # Construct lower triangle matrix. @@ -460,7 +462,7 @@ def advance_step(self, for i in range(num_queries): self.seq_lens[i] += 1 self.max_decode_seq_len = max(self.seq_lens) - if CUSTOM_OP_ENABLED: + if enable_custom_op(): #advance a step on NPU for existing inputs for a multi-step runner if custom ops is enabled torch.ops._C.advance_step_flashattn_ascendc( num_seqs=num_seqs, @@ -1005,6 +1007,15 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + # TODO: support numHeads / numKvHeads < 16 in MLA kernel + if self.torchair_graph_enabled: + assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ + ("The allowed number of queries per kv when enabling both MLA and Graph mode" + " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," + " as it only has 16 attention heads. 
And if you're using DeepSeek-V3 or DeepSeek-R1," + " please make sure after the tensor parallel split, num_heads / num_kv_heads in " + "{32, 64, 128}.") + def exec_kv( self, hidden_states: torch.Tensor, diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 70fb44df8..43cb71c68 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -14,11 +14,13 @@ from vllm.utils import cdiv, round_down from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig from vllm_ascend.multistream.context import get_multistream_comm_context from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla +from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -551,15 +553,29 @@ def __init__( self.o_proj = kwargs['o_proj'] self.kv_a_proj_with_mqa = kwargs.get('kv_a_proj_with_mqa', None) self.kv_a_layernorm = kwargs.get('kv_a_layernorm', None) + self.num_queries_per_kv = self.num_heads // self.num_kv_heads ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz + self.enable_multistream_mla = \ + ascend_config.torchair_graph_config.enable_multistream_mla + # Adapt torch air graph mode with spec decoding. speculative_config = get_current_vllm_config().speculative_config if speculative_config is not None: self.spec_token_num = speculative_config.num_speculative_tokens assert self.spec_token_num > 0 + # TODO: support numHeads / numKvHeads < 16 in MLA kernel + if self.torchair_graph_enabled: + assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ + ("The allowed number of queries per kv when enabling both MLA and Graph mode" + " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," + " as it only has 16 attention heads. 
And if you're using DeepSeek-V3 or DeepSeek-R1," + " please make sure after the tensor parallel split, num_heads / num_kv_heads in " + "{32, 64, 128}.") + def _v_up_proj_and_o_proj(self, x): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) @@ -848,7 +864,40 @@ def exec_kv( kv = self.kv_a_proj_with_mqa(hidden_states)[0] # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) - k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( + cache_mode = "PA_NZ" if self.enable_kv_nz else "PA" + with npu_stream_switch("mla_secondary", + 0, + enabled=self.enable_multistream_mla): + k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( + kv, + self.kv_a_layernorm.weight, + cos, + sin, + slots.to(torch.int64), + kv_cache[1], + kv_cache[0], + epsilon=self.kv_a_layernorm.variance_epsilon, + cache_mode=cache_mode, + ) + return k_pe, k_nope + + def exec_kv_prefill( + self, + hidden_states: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + kv_cache: Tuple, + slots: torch.Tensor, + ): + + B = hidden_states.shape[0] + N = self.num_kv_heads + S = 1 + kv = self.kv_a_proj_with_mqa(hidden_states)[0] + # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] + kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) + cache_mode = "PA_BLK_NZ" if self.enable_kv_nz else "PA" + _, _, k_pe, k_nope = torch_npu.npu_kv_rmsnorm_rope_cache( kv, self.kv_a_layernorm.weight, cos, @@ -857,7 +906,8 @@ def exec_kv( kv_cache[1], kv_cache[0], epsilon=self.kv_a_layernorm.variance_epsilon, - cache_mode="PA", + cache_mode=cache_mode, + is_output_kv=True, ) return k_pe, k_nope @@ -895,34 +945,42 @@ def _forward_decode( # TorchAir's shape is [bs, num_heads_per_rank, q_seq_len, dim] if attn_metadata.attn_state == AscendAttentionState.SpecDecoding: assert num_tokens % self.spec_token_num == 0 - q_nope = (q_nope.view( - num_tokens // (self.spec_token_num + 1), - self.spec_token_num + 1, - self.num_heads, - -1, - ).transpose(1, 2).contiguous()) - q_pe = (q_pe.view( - num_tokens // (self.spec_token_num + 1), - self.spec_token_num + 1, - self.num_heads, - -1, - ).transpose(1, 2).contiguous()) + q_nope = q_nope.view(num_tokens // (self.spec_token_num + 1), + self.spec_token_num + 1, self.num_heads, + -1) + q_pe = q_pe.view(num_tokens // (self.spec_token_num + 1), + self.spec_token_num + 1, self.num_heads, -1) + if not self.enable_kv_nz: + q_nope = q_nope.transpose(1, 2).contiguous() + q_pe = q_pe.transpose(1, 2).contiguous() sparse_mode = 3 spec_attn_mask = attn_metadata.decode.attn_mask # type:ignore else: - q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) - q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) + if self.enable_kv_nz: + q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1) + q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1) + else: + q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) + q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) sparse_mode = 0 spec_attn_mask = None # shape of knope/k_pe for npu graph mode should be: # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim] block_size = kv_c_and_k_pe_cache[0].shape[1] - k_nope = k_nope.view(-1, self.num_kv_heads, block_size, - self.kv_lora_rank) - k_pe = k_pe.view(-1, self.num_kv_heads, block_size, - self.qk_rope_head_dim) + if self.enable_kv_nz: + k_nope = k_nope.view(-1, self.num_kv_heads, + self.kv_lora_rank // 16, block_size, 16) + k_pe = k_pe.view(-1, self.num_kv_heads, + self.qk_rope_head_dim // 16, 
block_size, 16) + input_layout = "BSND" + else: + k_nope = k_nope.view(-1, self.num_kv_heads, block_size, + self.kv_lora_rank) + k_pe = k_pe.view(-1, self.num_kv_heads, block_size, + self.qk_rope_head_dim) + input_layout = "BNSD" - attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score( + attn_output, _ = torch_npu.npu_fused_infer_attention_score( q_nope, k_nope, k_nope, @@ -930,7 +988,7 @@ def _forward_decode( key_rope=k_pe, num_heads=self.num_heads, num_key_value_heads=self.num_kv_heads, - input_layout="BNSD", + input_layout=input_layout, atten_mask=spec_attn_mask, sparse_mode=sparse_mode, scale=self.scale, @@ -979,10 +1037,11 @@ def forward( ] num_actual_toks = attn_metadata.num_actual_tokens if k_pe is None and not self.running_in_graph: - kv_c, k_pe = self.kv_a_proj_with_mqa( - hidden_states_or_kv_c_normed)[0].split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) + if not self.torchair_graph_enabled: + kv_c, k_pe = self.kv_a_proj_with_mqa( + hidden_states_or_kv_c_normed)[0].split( + [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) else: kv_c_normed = hidden_states_or_kv_c_normed assert attn_metadata.num_decodes is not None and \ @@ -995,38 +1054,55 @@ def forward( # Inputs and outputs may be padded for CUDA graphs output_padded = output output = output[:num_actual_toks, ...] - kv_c_normed = kv_c_normed[:num_actual_toks, ...] - prefill_k_c_normed = kv_c_normed[num_decode_tokens:] + if not self.torchair_graph_enabled: + kv_c_normed = kv_c_normed[:num_actual_toks, ...] + prefill_k_c_normed = kv_c_normed[num_decode_tokens:] if not self.running_in_graph: hidden_states_or_q_c = hidden_states_or_q_c[:num_actual_toks, ...] - decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] prefill_hs_or_q_c = hidden_states_or_q_c[num_decode_tokens:] - k_pe = k_pe[:num_actual_toks, ...] - k_pe = k_pe.unsqueeze(1) - decode_k_pe = k_pe[:num_decode_tokens] - prefill_k_pe = k_pe[num_decode_tokens:] + if not self.torchair_graph_enabled: + decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] + k_pe = k_pe[:num_actual_toks, ...] + k_pe = k_pe.unsqueeze(1) + decode_k_pe = k_pe[:num_decode_tokens] + prefill_k_pe = k_pe[num_decode_tokens:] else: decode_hs_or_q_c = hidden_states_or_q_c if has_decode: decode_k_nope = None assert attn_metadata.decode is not None - decode_ql_nope, decode_q_pe = \ - self._q_proj_and_k_up_proj(decode_hs_or_q_c) if self.running_in_graph: seq_len = self.rotary_emb.max_position_embeddings cos = self.rotary_emb.cos_cached[:seq_len].to( - dtype=decode_q_pe.dtype) + dtype=decode_hs_or_q_c.dtype) sin = self.rotary_emb.sin_cached[:seq_len].to( - dtype=decode_q_pe.dtype) + dtype=decode_hs_or_q_c.dtype) cos = cos[attn_metadata.decode.input_positions] sin = sin[attn_metadata.decode.input_positions] cos = cos[:, None, None, :] sin = sin[:, None, None, :] - - decode_q_pe = self.rope_single(decode_q_pe, cos, sin) + # Without explicitly controlling the order, IndexByTensor operations + # would be placed after `matmul W_KV_T` hindering the overlapping of + # KvRmsNormRopeCache and SingleRope. 
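The wait/stream-switch calls below are what keep the two NPU streams genuinely overlapped. A rough sketch of that pattern, assuming only what the call sites in this hunk show (`npu_stream_switch(tag, priority, enabled=...)` acting as a context manager for a secondary stream, and `npu_wait_tensor(a, b, enabled=...)` making the consumers of `a` wait until `b` is produced); the function and tensor names here are placeholders, not the module's real attributes.

    from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor

    def rope_on_secondary_stream(q_pe, k_pe_from_cache, cos, sin, rope_fn,
                                 enabled: bool = True):
        # Mirror the wait-then-compute pattern used in this hunk: the RoPE of
        # q_pe runs on the secondary stream, but only after the KV-cache
        # kernel has produced k_pe_from_cache, so the main stream can keep
        # working in parallel instead of being serialized behind it.
        with npu_stream_switch("mla_secondary", 0, enabled=enabled):
            npu_wait_tensor(q_pe, k_pe_from_cache, enabled=enabled)
            return rope_fn(q_pe, cos, sin)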
+ npu_wait_tensor(decode_hs_or_q_c, + cos, + enabled=self.enable_multistream_mla) + npu_wait_tensor(decode_hs_or_q_c, + sin, + enabled=self.enable_multistream_mla) + decode_ql_nope, decode_q_pe = \ + self._q_proj_and_k_up_proj(decode_hs_or_q_c) + if self.running_in_graph: decode_k_pe, decode_k_nope = self.exec_kv( hidden_states_or_kv_c_normed, cos, sin, kv_cache, attn_metadata.slot_mapping) + with npu_stream_switch("mla_secondary", + 0, + enabled=self.enable_multistream_mla): + npu_wait_tensor(decode_q_pe, + decode_k_pe, + enabled=self.enable_multistream_mla) + decode_q_pe = self.rope_single(decode_q_pe, cos, sin) else: decode_q_pe[...], decode_k_pe[...] = self.rotary_emb( attn_metadata.decode.input_positions, @@ -1041,22 +1117,25 @@ def forward( prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim] if self.torchair_graph_enabled: num_tokens = prefill_hs_or_q_c.shape[0] + seq_len = self.rotary_emb.max_position_embeddings + cos = self.rotary_emb.cos_cached[:seq_len].to( + dtype=prefill_q_pe.dtype) + sin = self.rotary_emb.sin_cached[:seq_len].to( + dtype=prefill_q_pe.dtype) + cos = cos[attn_metadata.prefill.input_positions] + sin = sin[attn_metadata.prefill.input_positions] + cos = cos[:, None, None, :] + sin = sin[:, None, None, :] + + prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin) + prefill_k_pe, prefill_k_nope = self.exec_kv_prefill( + hidden_states_or_kv_c_normed, cos, sin, kv_cache, + attn_metadata.slot_mapping) + + kv_c_normed = prefill_k_nope[:num_actual_toks, ...] + prefill_k_c_normed = prefill_k_nope[num_decode_tokens:] prefill_k_pe = prefill_k_pe.view(num_tokens, self.num_kv_heads, -1) - if self.rotary_emb.__class__.__name__ == 'RotaryEmbedding': - # NOTE: When scaling not specified - ori_q_pe_shape, ori_k_pe_shape = prefill_q_pe.shape, prefill_k_pe.shape - prefill_q_pe = prefill_q_pe.reshape(num_tokens, -1) - prefill_k_pe = prefill_k_pe.reshape(num_tokens, -1) - prefill_q_pe, prefill_k_pe = self.rotary_emb( - attn_metadata.prefill.input_positions, prefill_q_pe, - prefill_k_pe) - prefill_q_pe = prefill_q_pe.view(ori_q_pe_shape) - prefill_k_pe = prefill_k_pe.view(ori_k_pe_shape) - else: - prefill_q_pe, prefill_k_pe = self.rotary_emb( - attn_metadata.prefill.input_positions, prefill_q_pe, - prefill_k_pe) prefill_q = torch.cat([prefill_q_nope, prefill_q_pe], dim=-1) else: prefill_q_pe[...], prefill_k_pe[...] = self.rotary_emb( diff --git a/vllm_ascend/compilation/piecewise_backend.py b/vllm_ascend/compilation/piecewise_backend.py index 95ce69393..c6a800b3d 100644 --- a/vllm_ascend/compilation/piecewise_backend.py +++ b/vllm_ascend/compilation/piecewise_backend.py @@ -31,8 +31,6 @@ from vllm.logger import logger from vllm.utils import weak_ref_tensors -from vllm_ascend.utils import vllm_version_is - @dataclasses.dataclass class ConcreteSizeEntry: @@ -206,11 +204,7 @@ def __call__(self, *args) -> Any: # to save memory entry.output = weak_ref_tensors(output) entry.aclgraph = aclgraph - - if vllm_version_is("0.9.0"): - compilation_counter.num_cudagraph_caputured += 1 - else: - compilation_counter.num_cudagraph_captured += 1 + compilation_counter.num_cudagraph_captured += 1 # important: we need to return the output, rather than # the weak ref of the output, so that pytorch can correctly diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index 42f5d9c69..2fa31c264 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -14,23 +14,24 @@ # limitations under the License. # This file is a part of the vllm-ascend project. 
# +import time from collections import deque from typing import Iterable, Union from vllm.config import VllmConfig +from vllm.distributed.kv_events import KVEventBatch from vllm.logger import logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.utils import cdiv +from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler -from vllm.v1.engine import EngineCoreOutputs +from vllm.v1.engine import EngineCoreEventType, EngineCoreOutputs from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm_ascend.utils import vllm_version_is - class AscendScheduler(Scheduler): """This Scheduler extends vllm's original v1 scheduler @@ -51,11 +52,6 @@ def __init__( self.scheduled_req_ids: set[str] = set() self.running: list[Request] = [] - if self.vllm_config.kv_transfer_config is not None and \ - self.vllm_config.kv_transfer_config.is_kv_consumer: - raise ValueError( - "AscendScheduler cannot be used for decode nodes. ") - def schedule(self) -> SchedulerOutput: if self.scheduler_config.chunked_prefill_enabled: return super().schedule() @@ -70,6 +66,9 @@ def schedule(self) -> SchedulerOutput: # Spec decode-related. scheduled_spec_decode_tokens: dict[str, list[int]] = {} + # For logging. + scheduled_timestamp = time.monotonic() + # Record scheduled LoRA requests. scheduled_loras: set[int] = set() @@ -88,6 +87,18 @@ def skip_cur_request(): self.waiting.popleft() skipped_waiting_requests.appendleft(request) + num_prealloc_computed_tokens = 0 + # P/D: skip request if still waiting for remote kvs. + if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + is_ready = self._update_waiting_for_remote_kv(request) + if is_ready: + request.status = RequestStatus.WAITING + num_prealloc_computed_tokens = ( + request.num_computed_tokens) + else: + skip_cur_request() + continue + # Check that adding the request still respects the max_loras # constraint. if (self.lora_config and request.lora_request and @@ -97,42 +108,70 @@ def skip_cur_request(): skip_cur_request() continue - prompt_limit = self._get_prompt_limit(request) - # Get already-cached tokens. - computed_blocks, num_computed_tokens = ( - self.kv_cache_manager.get_computed_blocks(request)) - num_new_tokens = request.num_tokens - num_computed_tokens - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) - max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * - self.block_size) - prompt_limit = min(prompt_limit, max_tokens_in_kvcache) - - # Finish request that exceeds prompt_limit or kv cache size. - if num_new_tokens > prompt_limit: - logger.warning( - "Input prompt (%d tokens) is too long" - " and exceeds limit of %d", - num_new_tokens, - prompt_limit, - ) - request.status = RequestStatus.FINISHED_IGNORED - self.finished_req_ids.add(request.request_id) # type: ignore - self.waiting.popleft() - continue - - if num_new_tokens > token_budget: - # Scheduling would exceed token_budget, skip. - skip_cur_request() - continue - - assert num_new_tokens > 0 + num_external_computed_tokens = 0 + load_kv_async = False - if vllm_version_is("0.9.0"): - blocks = computed_blocks.blocks + # Get already-cached tokens. 
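The branch that follows splits "already computed" tokens into two sources: hits from the local prefix cache and hits reported by the KV connector. A small, self-contained sketch of that accounting (hypothetical helper and numbers, not the scheduler's actual API):

    def new_tokens_to_schedule(num_prompt_tokens: int,
                               num_native_computed_tokens: int,
                               num_external_computed_tokens: int) -> int:
        # Locally cached (prefix-cache) and externally cached (connector)
        # tokens both count as already computed; only the remainder of the
        # prompt is scheduled as new work.
        num_computed_tokens = (num_native_computed_tokens +
                               num_external_computed_tokens)
        return num_prompt_tokens - num_computed_tokens

    assert new_tokens_to_schedule(512, 256, 128) == 128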
+ if num_prealloc_computed_tokens == 0: + new_computed_blocks, num_native_computed_tokens = \ + self.kv_cache_manager.get_computed_blocks( + request) + + # Get externally-cached tokens if using a KVConnector. + if self.connector is not None: + num_external_computed_tokens, load_kv_async = ( + self.connector.get_num_new_matched_tokens( + request, num_native_computed_tokens)) + + # Total computed tokens (local + external). + num_computed_tokens = (num_native_computed_tokens + + num_external_computed_tokens) + else: + # P/D: skip checking prefix cache if loaded from remote kvs. + new_computed_blocks = KVCacheBlocks.create_empty() + num_native_computed_tokens = 0 + + # Total computed tokens (allocated in prior step). + num_computed_tokens = num_prealloc_computed_tokens + + # P/D: loading remote KV, do not allocate for new work. + if load_kv_async: + assert num_external_computed_tokens > 0 + num_new_tokens = 0 + blocks = None + # Number of tokens to be scheduled. else: + prompt_limit = self._get_prompt_limit(request) + # Get already-cached tokens. + computed_blocks, num_computed_tokens = ( + self.kv_cache_manager.get_computed_blocks(request)) + # We use `request.num_tokens` instead of + # `request.num_prompt_tokens` to consider the resumed + # requests, which have output tokens. + num_new_tokens = request.num_tokens - num_computed_tokens + max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * + self.block_size) + prompt_limit = min(prompt_limit, max_tokens_in_kvcache) + + # Finish request that exceeds prompt_limit or kv cache size. + if num_new_tokens > prompt_limit: + logger.warning( + "Input prompt (%d tokens) is too long" + " and exceeds limit of %d", + num_new_tokens, + prompt_limit, + ) + request.status = RequestStatus.FINISHED_IGNORED + self.finished_req_ids.add( # type: ignore + request.request_id) # type: ignore + self.waiting.popleft() + continue + + if num_new_tokens > token_budget: + # Scheduling would exceed token_budget, skip. + skip_cur_request() + continue + assert num_new_tokens > 0 blocks = computed_blocks.blocks[0] watermark = getattr(self.scheduler_config, "watermark", 0.01) @@ -143,13 +182,38 @@ def skip_cur_request(): continue new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens, new_computed_blocks=computed_blocks) + request, + num_new_tokens + num_external_computed_tokens, + num_native_computed_tokens, + new_computed_blocks=computed_blocks, + num_lookahead_tokens=self.num_lookahead_tokens, + delay_cache_blocks=load_kv_async) if new_blocks is None: # The request cannot be scheduled. break + # KVConnector: update internal state after allocation. + # This information is used to determine if a load is + # needed for this request. + if num_external_computed_tokens: + assert self.connector is not None + self.connector.update_state_after_alloc( + request, + new_computed_blocks + new_blocks, + num_external_computed_tokens, + ) + self.waiting.popleft() + if load_kv_async: + # If loading async, allocate memory and put request + # into the WAITING_FOR_REMOTE_KV state. + skipped_waiting_requests.appendleft(request) + request.status = RequestStatus.WAITING_FOR_REMOTE_KVS + continue self.running.append(request) + if self.log_stats: + request.record_event(EngineCoreEventType.SCHEDULED, + scheduled_timestamp) self.scheduled_req_ids.add(request.request_id) # Check request status. 
if request.status == RequestStatus.WAITING: @@ -168,6 +232,9 @@ def skip_cur_request(): token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens + # Count the number of prefix cached tokens. + if request.num_cached_tokens < 0: + request.num_cached_tokens = num_computed_tokens # Put back any skipped requests at the head of the waiting queue if skipped_waiting_requests: @@ -186,16 +253,45 @@ def skip_cur_request(): num_new_tokens = (request.num_tokens_with_spec - request.num_computed_tokens) - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) + assert (request.num_tokens - request.num_computed_tokens) == 1 num_new_tokens = min(num_new_tokens, token_budget) - assert num_new_tokens == 1 + # Make sure the input position does not exceed the max model len. + # This is necessary when using spec decoding. + num_new_tokens = min( + num_new_tokens, + self.max_model_len - request.num_computed_tokens) + # Check that adding the request still respects the max_loras + # constraint. + if self.lora_config and request.lora_request and ( + len(scheduled_loras) == self.lora_config.max_loras + and request.lora_request.lora_int_id + not in scheduled_loras): + # Scheduling would exceed max_loras, skip. + num_new_tokens = 0 + + if num_new_tokens == 0: + # The request cannot be scheduled because one of the following + # reasons: + # 1. No new tokens to schedule. This may happen when PP>1 and + # we have already scheduled all prompt tokens but they are + # not finished yet. + # 2. Adding the request exceeds the max_loras constraint. + # NOTE(woosuk): Here, by doing `continue` instead of `break`, + # we do not strictly follow the FCFS scheduling policy and + # allow the lower-priority requests to be scheduled. + req_index += 1 + continue + + num_draft_tokens = max( + num_new_tokens + request.num_computed_tokens - + request.num_tokens, 0) while True: new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens) + request, + num_new_tokens, + num_draft_tokens=num_draft_tokens, + num_lookahead_tokens=self.num_lookahead_tokens) if new_blocks is None: # The request cannot be scheduled. # Preempt the lowest-priority request. @@ -203,6 +299,10 @@ def skip_cur_request(): self.kv_cache_manager.free(preempted_req) preempted_req.status = RequestStatus.PREEMPTED preempted_req.num_computed_tokens = 0 + if self.log_stats: + preempted_req.record_event( + EngineCoreEventType.PREEMPTED, + scheduled_timestamp) self.waiting.appendleft(preempted_req) preempted_reqs.append(preempted_req) if preempted_req == request: @@ -237,6 +337,10 @@ def skip_cur_request(): scheduled_spec_decode_tokens[request.request_id] = ( request.spec_token_ids) + + # Record scheduled LoRA requests. + if self.lora_config and request.lora_request: + scheduled_loras.add(request.lora_request.lora_int_id) + # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens @@ -304,6 +408,11 @@ def skip_cur_request(): meta = self.connector.build_connector_meta(scheduler_output) scheduler_output.kv_connector_metadata = meta + events = self.kv_cache_manager.take_events() + if events: + batch = KVEventBatch(ts=time.time(), events=events) + self.kv_event_publisher.publish(batch) + # Advance the number of computed tokens for the request AFTER # the request is scheduled. # 1. 
The scheduler_output of the current step has to include the @@ -330,14 +439,8 @@ def _check_watermark_for_prefill(self, len(computed_blocks) * self.block_size) num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, self.block_size) - - if vllm_version_is("0.9.0"): - req_blocks = self.kv_cache_manager.single_type_manager.req_to_blocks[ - request.request_id] - else: - req_blocks = self.kv_cache_manager.coordinator.get_blocks( - request.request_id) - + req_blocks = self.kv_cache_manager.coordinator.get_blocks( + request.request_id) num_new_blocks = (num_required_blocks - len(req_blocks) - len(computed_blocks)) num_evictable_computed_blocks = sum(1 for blk in computed_blocks @@ -401,7 +504,8 @@ def update_from_output( if num_tokens_scheduled == 0: # The request was not scheduled in this step. continue - self.scheduled_req_ids.remove(req_id) + if req_id in self.scheduled_req_ids: + self.scheduled_req_ids.remove(req_id) return super().update_from_output(scheduler_output, model_runner_output) diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py index c3de6ae93..9db49cbff 100644 --- a/vllm_ascend/models/deepseek_dbo.py +++ b/vllm_ascend/models/deepseek_dbo.py @@ -29,7 +29,7 @@ import torch import torch.distributed as dist -import torch_npu +import torch_npu # noqa: F401 import vllm.envs as envs from torch import nn from transformers import PretrainedConfig @@ -40,13 +40,10 @@ get_tp_group, tensor_model_parallel_all_reduce) from vllm.distributed.parallel_state import get_dp_group from vllm.forward_context import get_forward_context -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, ReplicatedLinear, - RowParallelLinear, - UnquantizedLinearMethod) + RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope @@ -67,6 +64,7 @@ import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2MLP from vllm_ascend.multistream.base import MSEventKey from vllm_ascend.multistream.context import ( advance_step_multistream_layer_context, get_multistream_comm_context, @@ -78,117 +76,17 @@ make_multistream_metadata_ds) from vllm_ascend.multistream.ms_split import compute_split_seq_index from vllm_ascend.ops.fused_moe import AscendFusedMoE -from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod from vllm_ascend.utils import dispose_tensor VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 -class CustomDeepseekDBOMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - quant_config: Optional[QuantizationConfig] = None, - reduce_results: bool = True, - prefix: str = "", - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj") - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: 
{hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - # NOTE: `torch_npu.npu_dequant_swiglu_quant` can only be enabled in dynamic quant - self.is_dynamic_quant = not isinstance( - self.gate_up_proj.quant_method, - UnquantizedLinearMethod) and isinstance( - self.gate_up_proj.quant_method.quant_method, - AscendW8A8DynamicLinearMethod) - - def forward(self, x): - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - x = tensor_model_parallel_all_reduce(x) - return x - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x +class CustomDeepseekDBOMLP(CustomDeepseekV2MLP): def _forward_ms_mlp(self, x): current_ms_metadata = get_multistream_comm_context() assert current_ms_metadata is not None - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - current_ms_metadata.before_comm_event.record() - with torch.npu.stream(current_ms_metadata.comm_stream): - current_ms_metadata.before_comm_event.wait() - x = tensor_model_parallel_all_reduce(x) - current_ms_metadata.after_comm_event.record() - return x gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) current_ms_metadata.before_comm_event.record() diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 96c76338b..0ae1142a3 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -25,7 +25,7 @@ # # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py # """Inference-only DeepseekV2/DeepseekV3 model.""" -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -69,12 +69,74 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import AscendFusedMoE +from vllm_ascend.quantization.quant_config import AscendLinearMethod from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod -from vllm_ascend.utils import dispose_tensor +from vllm_ascend.utils import (dispose_tensor, npu_stream_switch, + npu_wait_tensor) VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 +class 
CustomDeepseekV2SiluAndMul(SiluAndMul): + + def __init__(self, + *, + weight_scale: Optional[Callable[[], torch.Tensor]] = None): + super().__init__() + self.weight_scale = weight_scale + + def forward_oot(self, x: Union[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]): + if isinstance(x, tuple): + assert self.weight_scale is not None + # For AscendW8A8DynamicLinearMethod: + # a dynamic scale is passed along with the quantized value. + quantized_x, dynamic_scale = x + return torch_npu.npu_dequant_swiglu_quant( + x=quantized_x, + weight_scale=self.weight_scale(), + activation_scale=dynamic_scale, + activate_left=True, + quant_mode=1) + else: + return super().forward_oot(x) + + +class CustomDeepseekV2MergedReplicatedLinear(ReplicatedLinear): + + def __init__( + self, + input_size: int, + output_sizes: list[int], + bias: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + self.output_sizes = output_sizes + super().__init__(input_size, + sum(output_sizes), + bias=bias, + quant_config=quant_config, + prefix=prefix) + + def weight_loader(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, loaded_shard_id: int): + # With no support for GGUF format yet. + assert not getattr(param, "is_gguf_weight", False) + assert not getattr(param, "is_gguf_weight_type", False) + + assert loaded_shard_id < len(self.output_sizes) + shard_offset = sum(self.output_sizes[:loaded_shard_id]) + shard_size = self.output_sizes[loaded_shard_id] + shard = param.data.narrow(param.output_dim, shard_offset, shard_size) + + assert shard.size() == loaded_weight.size(), ( + f"Tried to load weights of size {loaded_weight.size()}" + f"to a parameter shard of id {loaded_shard_id} size {shard.size()}" + ) + shard.copy_(loaded_weight) + + class CustomDeepseekV2MLP(nn.Module): def __init__( @@ -84,61 +146,68 @@ def __init__( hidden_act: str, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True, + force_replicate: bool = False, prefix: str = "", ) -> None: super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj") + if not force_replicate: + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") + else: + self.gate_up_proj = CustomDeepseekV2MergedReplicatedLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = ReplicatedLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") - self.act_fn = SiluAndMul() - # NOTE: `torch_npu.npu_dequant_swiglu_quant` can only be enabled in dynamic quant - self.is_dynamic_quant = not isinstance( - self.gate_up_proj.quant_method, - UnquantizedLinearMethod) and isinstance( - self.gate_up_proj.quant_method.quant_method, - AscendW8A8DynamicLinearMethod) + quant_method = self.gate_up_proj.quant_method + if isinstance(quant_method, UnquantizedLinearMethod): + self.act_fn = CustomDeepseekV2SiluAndMul() + elif (isinstance(quant_method, AscendLinearMethod) and isinstance( + quant_method.quant_method, AscendW8A8DynamicLinearMethod)): + # TODO(sdmyzlp): Currently preserved as before: + # 1. The only quantization supported for silu is W8A8Dynamic + # 2. Output dtype of gate_up/down is fixed to be int32/bfloat16 + # + # Maybe one can implement a better and more general configuration + # scheme, e.g. by somehow passing around the tweaked `quant_config` + self.act_fn = CustomDeepseekV2SiluAndMul( + # Use lazy binding, for `weight_scale_fp32` is accessible + # only after `process_weights_after_loading`. + weight_scale=lambda: self.gate_up_proj.weight_scale_fp32) + # To be consumed by AscendW8A8DynamicLinearMethod.apply() + self.gate_up_proj._ascend_quant_config = { + "output_dtype": torch.int32, + "pertoken_scale": False, + "return_scale": True, + } + self.down_proj._ascend_quant_config = { + "output_dtype": torch.bfloat16, + "pertoken_scale": True, + "return_scale": False, + } + else: + raise NotImplementedError( + f"Quantization with [{type(quant_method)}] is NOT supported") def forward(self, x): - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - x = tensor_model_parallel_all_reduce(x) - return x gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) x, _ = self.down_proj(x) @@ -169,6 +238,12 @@ def __init__( raise ValueError(f"Unsupported activation: {config.hidden_act}. 
" "Only silu is supported for now.") + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on + self.enable_multistream_moe = \ + ascend_config.torchair_graph_config.enable_multistream_moe and VLLM_ENABLE_MC2 + self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, bias=False, @@ -204,8 +279,11 @@ def __init__( hidden_act=config.hidden_act, quant_config=quant_config, reduce_results=True, + force_replicate=self.enable_multistream_moe, prefix=f"{prefix}.shared_experts", ) + else: + self.shared_experts = None # type: ignore CustomDeepseekV2MoE.top_k = config.num_experts_per_tok self.dp_size = get_dp_group().world_size @@ -216,12 +294,6 @@ def __init__( self.params_dtype = torch.get_default_dtype() - ascend_config = get_ascend_config() - self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on - self.enable_multistream_shared_expert = \ - ascend_config.torchair_graph_config.enable_multistream_shared_expert and VLLM_ENABLE_MC2 - def forward( self, hidden_states: torch.Tensor, @@ -240,12 +312,10 @@ def forward( enable_force_load_balance = False if hasattr(attn_metadata, 'with_prefill_across_dp'): is_prefill = is_prefill or attn_metadata.with_prefill_across_dp - num_tokens, hidden_size = hidden_states.shape - - multistream = self.enable_multistream_shared_expert and not is_prefill - - old_hidden_states = hidden_states.clone() + old_hidden_states = hidden_states + use_separated_shared_experts = (self.shared_experts is not None + and not self.enable_multistream_moe) if self.tp_size > 1: if (VLLM_ENABLE_MC2 @@ -262,25 +332,22 @@ def forward( # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - kwargs = {} - if multistream: - kwargs.update({ - "shared_experts": self.shared_experts, - "shared_hidden_states": old_hidden_states - }) - - hidden_states = self.experts( + experts_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits, is_prefill=is_prefill, top_k=CustomDeepseekV2MoE.top_k, enable_force_load_balance=enable_force_load_balance, - **kwargs) - - if multistream: - hidden_states, shared_output = hidden_states + shared_experts=(self.shared_experts + if not use_separated_shared_experts else None), + ) - hidden_states = hidden_states * self.routed_scaling_factor + if not isinstance(experts_hidden_states, tuple): + hidden_states = experts_hidden_states * self.routed_scaling_factor + else: + hidden_states = ( + experts_hidden_states[0] * self.routed_scaling_factor + + experts_hidden_states[1]) if self.tp_size > 1: if (VLLM_ENABLE_MC2 @@ -294,12 +361,9 @@ def forward( else: hidden_states = tensor_model_parallel_all_reduce(hidden_states) - if self.n_shared_experts is not None: - if not multistream: - shared_output = self.shared_experts(old_hidden_states) - - if shared_output is not None: - hidden_states = hidden_states + shared_output + if use_separated_shared_experts: + hidden_states = hidden_states + self.shared_experts( + old_hidden_states) return hidden_states.view(num_tokens, hidden_size) @@ -433,6 +497,8 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_multistream_mla = \ + ascend_config.torchair_graph_config.enable_multistream_mla def forward( self, @@ -442,7 +508,14 @@ def forward( attn_metadata: 
Optional[AttentionMetadata] = None) -> torch.Tensor: if self.q_lora_rank is not None: ckq = self.q_a_proj(hidden_states)[0] - hidden_states_or_q_c = self.q_a_layernorm(ckq) + use_multistream_mla = (self.enable_multistream_mla + and attn_metadata is not None + and attn_metadata.num_decodes > 0) + npu_wait_tensor(hidden_states, ckq, enabled=use_multistream_mla) + with npu_stream_switch("mla_secondary", + 0, + enabled=use_multistream_mla): + hidden_states_or_q_c = self.q_a_layernorm(ckq) else: hidden_states_or_q_c = hidden_states if self.torchair_graph_enabled: diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 25f3b05d5..d6115d35c 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -16,7 +16,7 @@ # Adapted from vllm/tests/kernels/test_moe.py import os -from typing import Callable, List, Optional +from typing import Any, Callable, List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -36,6 +36,7 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer +from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 USING_LCCL_COM: bool = envs_ascend.USING_LCCL_COM @@ -106,15 +107,17 @@ def process_topk_ids(topk_ids: torch.Tensor, expert_num: int, ep_size: int, return topk_ids_pad, unpad_indices -def fused_experts_with_mc2(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - expert_map: torch.Tensor = None, - moe_all_to_all_group_name: Optional[str] = None, - **kwargs) -> torch.Tensor: +def fused_experts_with_mc2( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + moe_all_to_all_group_name: Optional[str] = None, + shared_experts: Optional[Any] = None +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: global_bs = 0 moe_expert_num = len(expert_map) kwargs_mc2 = { @@ -154,6 +157,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[ 0:5] + if shared_experts is not None: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(hidden_states, topk_weights) + shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states) + npu_wait_tensor(shared_gate_up, expand_x) + shared_act = shared_experts.act_fn(shared_gate_up) + w1 = w1.transpose(1, 2) group_list = expert_token_nums.to(torch.int64) @@ -210,7 +220,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2) - return hidden_states + if shared_experts is None: + return hidden_states + else: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(shared_act, down_out_list) + shared_hidden_states, _ = shared_experts.down_proj(shared_act) + return hidden_states, shared_hidden_states def apply_mlp(hidden_states_wrapper: List[torch.Tensor], @@ -875,6 +891,7 @@ def apply( e_score_correction_bias: Optional[torch.Tensor] = None, is_prefill: bool = False, enable_force_load_balance: bool = False, + shared_experts: Optional[Any] = None, **kwargs, ) -> torch.Tensor: @@ -924,7 +941,7 @@ def apply( top_k=top_k, expert_map=expert_map, moe_all_to_all_group_name=self.moe_all_to_all_group_name, - 
**kwargs) + shared_experts=shared_experts) elif self.torchair_graph_enabled or get_ep_group().world_size == 1: return fused_experts(hidden_states=x, w1=layer.w13_weight, @@ -1053,9 +1070,6 @@ def __init__( self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on - self.enable_multistream_shared_expert = \ - ascend_config.torchair_graph_config.enable_multistream_shared_expert and VLLM_ENABLE_MC2 if self.scoring_func != "softmax" and not self.use_grouped_topk: raise ValueError("Only softmax scoring function is supported for " @@ -1102,8 +1116,8 @@ def forward(self, router_logits: torch.Tensor, is_prefill: bool, enable_force_load_balance: bool = False, - top_k=None, - **kwargs): + top_k: Optional[int] = None, + shared_experts: Optional[Any] = None): assert self.quant_method is not None if top_k: @@ -1132,7 +1146,7 @@ def forward(self, hidden_states, router_logits) # Matrix multiply. - hidden_states = self.quant_method.apply( + e_hidden_states = self.quant_method.apply( layer=self, x=hidden_states, router_logits=router_logits, @@ -1150,36 +1164,39 @@ def forward(self, enable_force_load_balance=enable_force_load_balance, log2phy=self.log2phy, global_redundant_expert_num=self.global_redundant_expert_num, - **kwargs) + shared_experts=shared_experts, + ) - if self.enable_multistream_shared_expert and not is_prefill: - hidden_states, shared_output = hidden_states + if shared_experts is not None: + # Provide dummy implementation of "non-separated" shared experts. + if not isinstance(e_hidden_states, tuple): + return e_hidden_states, shared_experts(hidden_states) + else: + return e_hidden_states if self.dp_size > 1: if VLLM_ENABLE_MC2 and not is_prefill: ... 
elif self.torchair_graph_enabled: if USING_LCCL_COM: # type: ignore - hidden_states = dist._functional_collectives.reduce_scatter_tensor( - hidden_states, + e_hidden_states = dist._functional_collectives.reduce_scatter_tensor( + e_hidden_states, "sum", scatter_dim=0, group=get_dp_group().device_group) elif self.torchair_graph_enabled and not is_prefill: - hidden_states = dist._functional_collectives.reduce_scatter_tensor( - hidden_states, + e_hidden_states = dist._functional_collectives.reduce_scatter_tensor( + e_hidden_states, "sum", scatter_dim=0, group=get_dp_group().device_group) else: - hidden_states = get_ep_group().combine(hidden_states) + e_hidden_states = get_ep_group().combine(e_hidden_states) if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): - hidden_states = tensor_model_parallel_all_reduce(hidden_states) + e_hidden_states = tensor_model_parallel_all_reduce(e_hidden_states) - if self.enable_multistream_shared_expert and not is_prefill: - return hidden_states, shared_output - return hidden_states + return e_hidden_states # ----------------------------------------- TBO-related -------------------------------------------- diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 0c2a00afb..9f8ae784c 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -22,11 +22,12 @@ from vllm.model_executor.layers.rotary_embedding import ( DeepseekScalingRotaryEmbedding, RotaryEmbedding) -from vllm_ascend.platform import CUSTOM_OP_ENABLED +from vllm_ascend.utils import enable_custom_op def custom_rotary_embedding_enabled(query, neox_style, head_size): - return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and CUSTOM_OP_ENABLED + return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and enable_custom_op( + ) def rope_forward_oot( diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index ccf9bd9e0..3c24bfc70 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -24,9 +24,9 @@ # each worker's `__init__` function. # # Then in each kind of patch, there are three folders: -# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0. +# - patch_0_9_1: contains the patches applied when vllm version is 0.9.1. # - patch_main: contains the patches applied when vllm version is main branch. -# - patch_common: contains the patches applied in both 0.9.0 and main branch. +# - patch_common: contains the patches applied in both 0.9.1 and main branch. # # Once a new patch is added in vllm-ascend, please add the patch description into this file as well. # ---------------------------------------------------------------------------------- @@ -44,56 +44,40 @@ # platform owned `CoordinatorGroup` to make sure all the CoordinateGroup can be properly destroyed # How: # Call `vllm_ascend.distributed.parallel_state method `destroy_platform_model_parallel` to destroy all the `CoordinateGroup` -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# Related PR (if no, explain why): # Future Plan: # Remove those patch when vllm merged them -# 2. `vllm.distributed.stateless_init_torch_distributed_process_group()` +# 2. 
`vllm.v1.engine.core.DPEngineCoreProc._init_data_parallel` # Why: -# The stateless process group can not be initialized except from gloo and nccl backend, vllm-ascend -# needs to initialize its own stateless process group for communication, so we add the platform related -# call to the `stateless_init_torch_distributed_process_group`, to enable other platform which may support -# stateless process group initialize method +# There is a bug in the ASCEND_RT_VISIBLE_DEVICES handling. # How: -# rewrite stateless_init_torch_distributed_process_group to judge if there is a stateless process group initialize -# method and call platform method `platform_register_backend` to initialize them -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# The ASCEND_RT_VISIBLE_DEVICES related code is dropped. +# Related PR (if no, explain why): +# No, this is a bug in vllm-ascend. # Future Plan: -# Remove those patch when vllm merged them -# 3. `ParallelConfig.get_next_dp_init_port` +# Remove this patch once the ASCEND_RT_VISIBLE_DEVICES bug is fixed. +# 3. `vllm.config.ParallelConfig.get_next_dp_init_port` # Why: -# We want to get dp port from env variable, so the multi-node inference can be properly initialized and run. +# vllm doesn't support getting the dp port from an environment variable. # How: -# Get the dp port from env variable enable multi-mode dp inference -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# Add logic to get the port from an environment variable. +# Related PR (if no, explain why): +# A PR to vllm is needed to support getting the port from an environment variable. # Future Plan: -# Its a workaround in vllm-ascend to enable multi-node dp inference, maybe removed if vllm have better plan -# on multi-node dp inference implementation -# 4. `ParallelConfig.stateless_init_dp_group` +# Remove this patch once vllm merges it. +# 4. `vllm.config.ParallelConfig.stateless_init_dp_group` # Why: # vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to # get better performance # How: -# adopt nccl backend to init process group -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# Adopt the hccl backend to init the process group. (Now we still use gloo as a placeholder; we'll switch to hccl in the future.) +# Related PR (if no, explain why): +# A PR to vllm is needed to support more backends. # Future Plan: -# Remove those patch when vllm merged them -# +# Remove this patch when vllm supports more backends. # # * Worker Patch: # =============== -# ** File: worker/patch_common/patch_metrics.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. `vllm.spec_decode.metrics.AsyncMetricsCollector.maybe_collect_rejsample_metrics` -# Why: -# There are cuda hard code (current_platform.is_cuda_alike()) in -# `AsyncMetricsCollector.maybe_collect_rejsample_metrics` -# How: -# Change to use `current_platform.Event` to determine whether to return None -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... -# https://github.com/vllm-project/vllm/pull/14411 -# Future Plan: -# Revert it when the related pr is merged in vllm. -# # ** File: worker/patch_common/patch_minicpm.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1.
`vllm.model_executor.models.minicpm.MiniCPMAttention.forward` @@ -103,7 +87,7 @@ # However float32 is not supported in cann rope op, thus we keep this patch # How: # Removed the dtype convert operations in forward -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # NO, only for npu due to rope op. # Future Plan: # Keep this patch in vllm-ascend. @@ -119,7 +103,7 @@ # - support attention metadata register to the set supported spec decode # - offer a api in platform to determine whether spec decode is supported, # and deprecate is_cuda_alike in it. -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: @@ -131,14 +115,14 @@ # vLLM `Remove Sampler from Model Code` so vllm-ascend needs adapt to this change. # How: # Use vLLM 0.8.4 method to patch it. -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: # Remove it when we identify the reasons clearly. # # ** File: worker/patch_common/patch_spec_decode_worker.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker` # Why: # We need to use the patched `TP1DraftModelRunner` in `SpecDecodeWorker.create_worker`. @@ -146,14 +130,14 @@ # `FlashAttentionMetadata` # How: # ditto -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: # Revert it when the related pr is merged in vllm and vllm-ascend. # # ** File: worker/patch_common/patch_eagle.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.spec_decode.eagle.prepare_inputs` # Why: # We need to use the patched `prepare_input_kernel` in `eagle.prepare_inputs`. @@ -161,12 +145,12 @@ # kernel, ascend is now not support triton kernel. # How: # Re-implementation the `prepare_input_kernel` triton kernel by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - Ascend doesn't support triton # Future Plan: # Revert it when the ascend support triton kernel. # -# ** File: v1/sample/sampler.py ** +# ** File: worker/patch_common/patch_sampler.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.sample.sampler.Sampler.apply_top_k_top_p` # Why: @@ -175,21 +159,44 @@ # to improve performance. # How: # Re-implementation the `apply_top_k_top_p` function by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm-ascend/pull/970 # Future Plan: # Revert it when the ascend scatter performance improves. # -# ** File: v1/sample/sampler.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~s -# 1. 
`vllm.v1.sample.sampler.Sampler.apply_min_p` +# 2. `vllm.v1.sample.sampler.Sampler.apply_min_p` # Why: # We need to use the patched `apply_min_p` in `sample`. # The mainly reason to overwrite `apply_min_p` is # to improve performance. # How: # Re-implementation the `apply_min_p` function by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm-ascend/pull/970 # Future Plan: # Revert it when the ascend indexput performance improves. +# +# ** File: worker/patch_common/patch_distributed.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.distributed.parallel_state.GroupCoordinator` +# Why: +# vllm doesn't support all_to_all for GroupCoordinator. +# How: +# Add all_to_all implementation for GroupCoordinator. +# Related PR (if no, explain why): +# A PR to vllm is needed to support all_to_all for GroupCoordinator. +# Future Plan: +# Remove this patch when vllm merges it. +# +# ** File: worker/patch_common/patch_utils.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.utils.direct_register_custom_op` +# Why: +# pytorch 2.7.0 is not compatible with pytorch 2.5.1. vllm is based on pytorch 2.7.0, while vllm-ascend +# is based on pytorch 2.5.1, so we need this patch to make vllm compatible with pytorch 2.5.1. +# How: +# patch the __annotations__ check to make it compatible with pytorch 2.5.1. +# Related PR (if no, explain why): +# This is a problem in vllm-ascend. +# Future Plan: +# Remove this patch once pytorch 2.7.0 is supported in vllm-ascend. diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index e724fe531..4ec38e38d 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -17,8 +17,8 @@ from vllm_ascend.utils import vllm_version_is # Import specific patches for different versions -if vllm_version_is("0.9.0"): - from vllm_ascend.patch.platform import patch_0_9_0 # noqa: F401 +if vllm_version_is("0.9.1"): + from vllm_ascend.patch.platform import patch_0_9_1 # noqa: F401 from vllm_ascend.patch.platform import patch_common # noqa: F401 else: from vllm_ascend.patch.platform import patch_common # noqa: F401 diff --git a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py b/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py deleted file mode 100644 index d468326bd..000000000 --- a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py +++ /dev/null @@ -1,116 +0,0 @@ -import torch -from torch.distributed import ProcessGroup -from torch.distributed.distributed_c10d import (Backend, PrefixStore, - _get_default_timeout, - is_nccl_available) -from torch.distributed.rendezvous import rendezvous -from vllm.distributed import utils - - -def stateless_init_torch_distributed_process_group( - host: str, port: int, rank: int, world_size: int, - backend: str) -> ProcessGroup: - """ - A replacement for `torch.distributed.init_process_group` that does not - pollute the global state. The created ProcessGroup object can be used for - some operations such as `allreduce`, because it does not depend on the - global rank. However, some operations such as `broadcast` cannot be used - because it depends on the global rank. - - # TODO: ask for help from PyTorch team if we need the `broadcast` operation. - - This function is useful when we are not sure about the total number of - processes in the process group.
For example, we may have process - 1, 2, ..., 8 who want to communicate, and process 9 might be the same - process as process 1, or it might be a different process; process 10 - might be the same process as process 5, or it might be a different process. - In this case, how can we reliably form a communication channel within - process 9 and 10, without affecting the communication channel within - process 1, 2, ..., 8? - - One possible solution is to figure out if process 9 and 10 are the same - as process 1 and 5 beforehand, and then form a communication channel - based on the information, adjusting the ranks and world_size etc. However, - figuring out the information is not always easy, and it will interfere - with the main communication channel. - - Our solution is to always form a communication channel with process 1, 2, - ..., 8, and then use this function to form another communication channel - with process 9 and 10. This way, regardless of whether process 9 and 10 - are the same as process 1 and 5, the main communication channel is - always formed with process 1, 2, ..., 8, and the additional communication - channel is formed with process 9 and 10. - """ - init_method = f"tcp://{host}:{port}" - backend = Backend(backend) # it is basically string - timeout = _get_default_timeout(backend) - - store, rank, world_size = next( - rendezvous(init_method, rank, world_size, timeout=timeout)) - store.set_timeout(timeout) - - group_rank = rank - group_size = world_size - - # Use a PrefixStore to avoid accidental overrides of keys used by - # different systems (e.g. RPC) in case the store is multi-tenant. - prefix_store = PrefixStore(init_method, store) - - # TODO(Yizhou): The reason we need to set options while vllm does not - # seems to be related to the version of PyTorch. In the latest version, - # there is no need to set options. While in the older version, 2.5.1 - # specifically, we need to set options. - options = ProcessGroup.Options(backend=backend) - pg: ProcessGroup = ProcessGroup( - prefix_store, - group_rank, - group_size, - options, - ) - if backend == "gloo": - from torch.distributed.distributed_c10d import ProcessGroupGloo - backend_class = ProcessGroupGloo(prefix_store, - group_rank, - group_size, - timeout=timeout) - backend_type = ProcessGroup.BackendType.GLOO - device = torch.device("cpu") - elif backend == "nccl": - assert is_nccl_available() - from torch.distributed.distributed_c10d import ProcessGroupNCCL - - backend_options = ProcessGroupNCCL.Options() - backend_options._timeout = timeout - - backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, - backend_options) - backend_type = ProcessGroup.BackendType.NCCL - device = torch.device("cuda") - elif backend == "hccl": - from torch.distributed import is_hccl_available - assert is_hccl_available() - from torch_npu._C._distributed_c10d import ProcessGroupHCCL - backend_options = ProcessGroupHCCL.Options() - backend_options._timeout = timeout - backend_class = ProcessGroupHCCL(prefix_store, group_rank, group_size, - backend_options) - device = torch.device("npu") - backend_class._set_sequence_number_for_group() - backend_type = ProcessGroup.BackendType.CUSTOM - pg._register_backend(device, backend_type, backend_class) - return pg - else: - raise RuntimeError(f"Unsupported torch distributed backend: {backend}") - - # TODO(Yizhou): Like we mentioned above, _set_default_backend is not - # implemented in the 2.5.1 version of PyTorch. But we need to set it - # after the latest version is released. 
- # pg._set_default_backend(backend_type) - backend_class._set_sequence_number_for_group() - - pg._register_backend(device, backend_type, backend_class) - - return pg - - -utils.stateless_init_torch_distributed_process_group = stateless_init_torch_distributed_process_group diff --git a/vllm_ascend/patch/worker/patch_0_9_0/__init__.py b/vllm_ascend/patch/platform/patch_0_9_1/__init__.py similarity index 100% rename from vllm_ascend/patch/worker/patch_0_9_0/__init__.py rename to vllm_ascend/patch/platform/patch_0_9_1/__init__.py diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index d1d3d42f4..3b29856d2 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -18,8 +18,8 @@ from vllm_ascend.utils import vllm_version_is # Import specific patches for different versions -if vllm_version_is("0.9.0"): - from vllm_ascend.patch.worker import patch_0_9_0 # noqa: F401 +if vllm_version_is("0.9.1"): + from vllm_ascend.patch.worker import patch_0_9_1 # noqa: F401 from vllm_ascend.patch.worker import patch_common # noqa: F401 else: from vllm_ascend.patch.worker import patch_common # noqa: F401 diff --git a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py similarity index 90% rename from vllm_ascend/patch/platform/patch_0_9_0/__init__.py rename to vllm_ascend/patch/worker/patch_0_9_1/__init__.py index f0ac16236..116c73c06 100644 --- a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py +++ b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py @@ -14,4 +14,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import vllm_ascend.patch.platform.patch_0_9_0.patch_distributed # noqa diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 912e375f7..80211ad13 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -16,7 +16,6 @@ # import gc -import logging import os from datetime import timedelta from typing import TYPE_CHECKING, Optional, Tuple @@ -32,16 +31,6 @@ from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes -CUSTOM_OP_ENABLED = False -try: - # register custom ops into torch_library here - import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401 - CUSTOM_OP_ENABLED = True -except ImportError as e: - logging.warning( - "Failed to import 'vllm_ascend.vllm_ascend_C': %s. All custom ops will be disabled. ", - e) - if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig from vllm.utils import FlexibleArgumentParser @@ -50,7 +39,6 @@ VllmConfig = None FlexibleArgumentParser = None -os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1" os.environ["ACL_OP_INIT_MODE"] = ascend_envs.VLLM_ASCEND_ACL_OP_INIT_MODE diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index c6e863ff7..66a0a302c 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -15,19 +15,19 @@ # limitations under the License. 
# -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, Optional, Tuple, Union import torch import torch.distributed as dist import torch_npu -import torchair as tng # type: ignore -from vllm.distributed import GroupCoordinator, tensor_model_parallel_all_reduce +from vllm.distributed import GroupCoordinator import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import select_experts -from vllm_ascend.utils import dispose_tensor +from vllm_ascend.utils import (dispose_tensor, npu_stream_switch, + npu_wait_tensor) VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 @@ -39,8 +39,7 @@ def apply_mlp(hidden_states: torch.Tensor, w2_scale: torch.Tensor, group_list: torch.Tensor, dynamic_scale: torch.Tensor = None, - group_list_type: int = 1, - **kwargs) -> torch.Tensor: + group_list_type: int = 1) -> torch.Tensor: """ apply MLP: gate_up_proj -> swiglu -> down_proj @@ -74,23 +73,6 @@ def apply_mlp(hidden_states: torch.Tensor, else: pertoken_scale = dynamic_scale - shared_experts = kwargs.get('shared_experts', None) - if shared_experts: - shared_gate_up = kwargs.get('shared_gate_up', None) - shared_dynamic_scale = kwargs.get('shared_dynamic_scale', None) - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_gate_up, hidden_states) - shared_x, shared_dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=shared_gate_up, - weight_scale=shared_experts.gate_up_proj.weight_scale_fp32, - activation_scale=shared_dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - # gmm1: gate_up_proj hidden_states = torch_npu.npu_grouped_matmul( x=[hidden_states], @@ -120,36 +102,24 @@ def apply_mlp(hidden_states: torch.Tensor, group_list=group_list, output_dtype=w2_scale.dtype)[0] - if shared_experts: - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_x, hidden_states) - shared_output = torch_npu.npu_quant_matmul( - shared_x, - shared_experts.down_proj.weight, - shared_experts.down_proj.weight_scale, - pertoken_scale=shared_dynamic_scale, - output_dtype=torch.bfloat16, - ) - if shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1: - shared_output = tensor_model_parallel_all_reduce(shared_output) - if shared_experts: - return hidden_states, shared_output return hidden_states -def fused_experts_with_mc2(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - expert_map: torch.Tensor = None, - moe_all_to_all_group_name: str = "", - log2phy: torch.Tensor = None, - global_redundant_expert_num: int = 0, - **kwargs) -> torch.Tensor: +def fused_experts_with_mc2( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + moe_all_to_all_group_name: str = "", + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[Any] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if log2phy: topk_ids = log2phy[topk_ids] global_bs = 0 @@ -188,31 +158,17 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, } kwargs_mc2.update(stage1_kwargs) - 
shared_experts = kwargs.get('shared_experts', None) - if shared_experts: - shared_hidden_states = kwargs.get('shared_hidden_states', None) - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_hidden_states, hidden_states) - shared_x, shared_dynamic_scale = torch_npu.npu_dynamic_quant( - shared_hidden_states) - shared_gate_up = torch_npu.npu_quant_matmul( - shared_x, - shared_experts.gate_up_proj.weight, - shared_experts.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - kwargs.update({ - "shared_gate_up": shared_gate_up, - "shared_dynamic_scale": shared_dynamic_scale, - }) - output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2) # comm_stream.wait_stream(torch.npu.current_stream()) expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[ 0:5] - if quant_mode == 0: - dynamic_scale = None + if shared_experts is not None: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(hidden_states, topk_weights) + shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states) + npu_wait_tensor(shared_gate_up[0], expand_x) + shared_act = shared_experts.act_fn(shared_gate_up) # `expand_x` will be disposed in the `apply_mlp` function down_out_list = apply_mlp(expand_x, @@ -221,12 +177,7 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, w2, w2_scale, expert_token_nums, - dynamic_scale=dynamic_scale, - **kwargs) - - multi_stream = isinstance(down_out_list, tuple) - if multi_stream: - down_out_list, shared_output = down_out_list + dynamic_scale=dynamic_scale) # moeCombine kwargs_mc2 = { @@ -257,9 +208,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2) - if multi_stream: + if shared_experts is None: + return hidden_states + else: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(shared_act[0], down_out_list) + shared_output, _ = shared_experts.down_proj(shared_act) return hidden_states, shared_output - return hidden_states # currently expert parallelism implemented with all2all @@ -541,21 +496,33 @@ def get_perchannel_param( @staticmethod def apply( layer: torch.nn.Module, - x: torch.Tensor, + x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], bias: Optional[torch.Tensor] = None, tp_rank: Optional[int] = 0, ) -> torch.Tensor: - original_dtype = x.dtype - # use ATB quantize - quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x) - return torch_npu.npu_quant_matmul( - quant_out, + config = getattr(layer, "_ascend_quant_config", {}) + if not isinstance(x, tuple): + output_dtype = config.get("output_dtype", x.dtype) + quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x) + else: + assert "output_dtype" in config.keys(), ( + f"DynamicLinearMethod needs explicitly specified `output_dtype`" + f"for pre-quantized input, got config [{config}]") + output_dtype = config["output_dtype"] + quantized_x, dynamic_scale = x + pertoken_scale = (dynamic_scale + if config.get("pertoken_scale", True) else None) + + output = torch_npu.npu_quant_matmul( + quantized_x, layer.weight, layer.weight_scale, - pertoken_scale=dynamic_scale, + pertoken_scale=pertoken_scale, bias=bias, - output_dtype=original_dtype, + output_dtype=output_dtype, ) + return ((output, dynamic_scale) + if config.get("return_scale", False) else output) def process_weights_after_loading(self, layer): if self.transpose_weight: @@ -650,6 +617,7 @@ def apply( enable_force_load_balance: bool = True, log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, + 
shared_experts: Optional[Any] = None, **kwargs, ) -> torch.Tensor: assert router_logits.shape[ @@ -706,7 +674,7 @@ def apply( moe_all_to_all_group_name=self.moe_all_to_all_group_name, log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, - **kwargs) + shared_experts=shared_experts) elif self.torchair_graph_enabled or self.ep_group.world_size == 1: return fused_experts(hidden_states=x, w1=layer.w13_weight, diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 7d4093804..6c2c6c608 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -19,17 +19,26 @@ import atexit import math -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext from threading import Lock from typing import TYPE_CHECKING, List, Tuple import torch +import torchair # type: ignore[import] # noqa: F401 from packaging.version import InvalidVersion, Version from torch_npu.npu.streams import Event from vllm.logger import logger import vllm_ascend.envs as envs +try: + # Recent release of torchair has moved these ops to `.scope`. + from torchair.scope import npu_stream_switch as _npu_stream_switch + from torchair.scope import npu_wait_tensor as _npu_wait_tensor +except ImportError: + from torchair.ops import NpuStreamSwitch as _npu_stream_switch + from torchair.ops import npu_wait_tensor as _npu_wait_tensor + if TYPE_CHECKING: from vllm.config import VllmConfig else: @@ -44,6 +53,8 @@ ASCEND_QUATIZATION_METHOD = "ascend" +CUSTOM_OP_ENABLED = None + def try_register_lib(lib_name: str, lib_info: str = ""): import importlib @@ -58,6 +69,31 @@ def try_register_lib(lib_name: str, lib_info: str = ""): pass +def enable_custom_op(): + """ + Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component. + Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device(). 
+ """ + global CUSTOM_OP_ENABLED + + if CUSTOM_OP_ENABLED is not None: + return CUSTOM_OP_ENABLED + + else: + try: + # register custom ops into torch_library here + import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401 + CUSTOM_OP_ENABLED = True + + except ImportError: + CUSTOM_OP_ENABLED = False + logger.warning( + "Warning: Failed to register custom ops, all custom ops will be disabled" + ) + + return CUSTOM_OP_ENABLED + + def find_hccl_library() -> str: """ We either use the library file specified by the `HCCL_SO_PATH` @@ -227,3 +263,14 @@ def pop_captured_sync(self) -> dict: durations[tag] = observe_start.elapsed_time(observe_end) return durations + + +def npu_stream_switch(tag: str, priority: int, *, enabled: bool = True): + return _npu_stream_switch(tag, priority) if enabled else nullcontext() + + +def npu_wait_tensor(self: torch.Tensor, + dependency: torch.Tensor, + *, + enabled: bool = True): + return _npu_wait_tensor(self, dependency) if enabled else self diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 18d9f3ac5..8dcad248c 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -68,7 +68,7 @@ from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata from vllm_ascend.platform import NPUPlatform from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler -from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is +from vllm_ascend.utils import ProfileExecuteDuration from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer if TYPE_CHECKING: @@ -1066,8 +1066,8 @@ def execute_model( for tag, duration in durations.items() ] captured_name = "Decode" if self.attn_state == AscendAttentionState.DecodeOnly else "Prefill" - print(f"Profile execute duration [{captured_name}]:", - " ".join(dr_str)) + logger.info("Profile execute duration [%s]:%s", captured_name, + " ".join(dr_str)) return model_runner_output @@ -1360,44 +1360,27 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: import torch_npu kv_caches: Dict[str, torch.Tensor] = {} - # Remove this after we drop 0.9.0 support - if vllm_version_is("0.9.0"): - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.model_config.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=True, - vocab_size=self.model_config.get_vocab_size(), - block_size=self.cache_config.block_size, - ) - else: - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.model_config.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=True, - vocab_size=self.model_config.get_vocab_size(), - block_sizes=[self.cache_config.block_size], - ) + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.model_config.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=True, + vocab_size=self.model_config.get_vocab_size(), + block_sizes=[self.cache_config.block_size], + ) - if not vllm_version_is("0.9.0"): - kv_cache_sizes = {} - for kv_cache_tensor in kv_cache_config.kv_cache_tensors: - assert len(kv_cache_tensor.shared_by) == 1, ( - "KV cache tensor shared by multiple layers is not supported in " - "NPU.") - kv_cache_sizes[ - kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size + kv_cache_sizes = {} + for kv_cache_tensor in kv_cache_config.kv_cache_tensors: + assert len(kv_cache_tensor.shared_by) == 1, ( + "KV 
cache tensor shared by multiple layers is not supported in " + "NPU.") + kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size for kv_cache_group in kv_cache_config.kv_cache_groups: kv_cache_spec = kv_cache_group.kv_cache_spec for layer_name in kv_cache_group.layer_names: - if vllm_version_is("0.9.0"): - tensor_size = kv_cache_config.tensors[layer_name].size - else: - tensor_size = kv_cache_sizes[layer_name] + tensor_size = kv_cache_sizes[layer_name] assert tensor_size % kv_cache_spec.page_size_bytes == 0 num_blocks = tensor_size // kv_cache_spec.page_size_bytes diff --git a/vllm_ascend/worker/multi_step_worker.py b/vllm_ascend/worker/multi_step_worker.py index ba83f6b96..6d092805d 100644 --- a/vllm_ascend/worker/multi_step_worker.py +++ b/vllm_ascend/worker/multi_step_worker.py @@ -119,7 +119,7 @@ def _prepare_last_sampled_token_ids_for_tp_workers( # execute_model_req assert execute_model_req.last_sampled_token_ids is not None model_input.last_sampled_token_ids = ( - execute_model_req.last_sampled_token_ids.cuda()) + execute_model_req.last_sampled_token_ids.npu()) model_input.add_sampler_output( SamplerOutput(outputs=[], sampled_token_ids=None), model_input.last_sampled_token_ids) diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index ba29dcfc5..ebdf01e15 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -117,6 +117,11 @@ def wake_up(self, tags: Optional[list[str]] = None) -> None: allocator = CaMemAllocator.get_instance() allocator.wake_up(tags=tags) + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + def init_device(self): if self.device_config.device.type == "npu": self.device = torch.device(f"npu:{self.local_rank_across_dp}")
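
Note (illustrative sketch, not part of the patch): the dual-stream pattern used above in the MLA and MC2 MoE paths reduces to the two helpers added in vllm_ascend/utils.py. The snippet below shows how they are intended to be combined; `dual_stream_step`, `heavy_main_op`, `light_side_op` and the stream tag "secondary" are hypothetical stand-ins, and only `npu_stream_switch` and `npu_wait_tensor` come from this diff.

import torch

from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor


def dual_stream_step(x: torch.Tensor, heavy_main_op, light_side_op,
                     enabled: bool = True):
    # The main computation is issued on the default stream.
    main_out = heavy_main_op(x)
    # Record the cross-stream dependency: work consuming `x` on the secondary
    # stream must not start before `main_out` is materialized. With
    # enabled=False both helpers degrade to no-ops (single-stream behavior).
    npu_wait_tensor(x, main_out, enabled=enabled)
    with npu_stream_switch("secondary", 0, enabled=enabled):
        # Kernels issued inside this context go to the "secondary" stream and
        # can overlap with subsequent default-stream kernels.
        side_out = light_side_op(x)
    return main_out, side_out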