diff --git a/.github/workflows/accuracy_report.yaml b/.github/workflows/accuracy_report.yaml index 1d03e1318..fe1dbd103 100644 --- a/.github/workflows/accuracy_report.yaml +++ b/.github/workflows/accuracy_report.yaml @@ -19,110 +19,184 @@ name: Accuracy Report on: workflow_dispatch: inputs: - branch: - description: 'choose a dev branch to pr' + vllm-ascend-branch: + description: 'vllm-ascend branch:' required: true - vllm-ascend-version: - description: 'what vllm-ascend version to accuracy test?' + type: choice + options: + - main + - v0.7.3-dev + models: + description: 'models:' required: true - type: string + type: choice + options: + - all + - Qwen/Qwen2.5-7B-Instruct + - Qwen/Qwen2.5-VL-7B-Instruct + - Qwen/Qwen3-8B-Base + default: 'all' + jobs: - download: + download_reports: runs-on: ubuntu-latest + strategy: + matrix: + model: ${{ fromJSON( + (github.event.inputs.models == 'all' && + '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || + (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && + '["Qwen/Qwen2.5-7B-Instruct"]') || + (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && + '["Qwen/Qwen2.5-VL-7B-Instruct"]') || + (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' && + '["Qwen/Qwen3-8B-Base"]') + ) }} + + version: [0, 1] + exclude: + - model: 'Qwen/Qwen2.5-VL-7B-Instruct' + version: 1 + fail-fast: false + + name: Download ${{ matrix.model }} V${{ matrix.version }} steps: - name: Checkout repository uses: actions/checkout@v4 with: - ref: ${{ github.event.inputs.branch }} - - - name: Debug List Artifacts - run: gh api /repos/${{ github.repository }}/actions/artifacts - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ref: ${{ github.event.inputs.vllm-ascend-branch }} - - name: Query artifact run id for Qwen2.5-VL-7B-Instruct V0 latest artifact - id: get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0 + - name: Get base model name + id: get_basename run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") - RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-VL-7B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + model_base_name=$(basename "${{ matrix.model }}") + echo "model_base_name=$model_base_name" >> $GITHUB_OUTPUT + shell: bash - - name: Query artifact run id for Qwen2.5-7B-Instruct V0 latest artifact - id: get_Qwen2_5_7B_Instruct_latest_run_id_V0 + - name: Query artifact run id + id: get_run_id run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") + ARTIFACT_PATTERN="${{ github.event.inputs.vllm-ascend-branch }}-${{ steps.get_basename.outputs.model_base_name }}-V${{ matrix.version }}-report" + echo "Querying artifacts with pattern: $ARTIFACT_PATTERN" + + ARTIFACT_JSON=$(gh api --paginate /repos/${{ github.repository }}/actions/artifacts || echo "{}") + RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-7B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" + jq -s -r --arg pattern "$ARTIFACT_PATTERN" \ + '[.[].artifacts[]] | map(select(.name | test($pattern))) | sort_by(.created_at) | last | .workflow_run.id // empty') + + if [ -z "$RUN_ID" ]; then + echo "::warning::No artifact found matching pattern $ARTIFACT_PATTERN. Skipping download." 
+ echo "runid=" >> $GITHUB_OUTPUT + else + echo "Found matching artifact with run ID: $RUN_ID" + echo "runid=$RUN_ID" >> $GITHUB_OUTPUT + fi env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Query artifact run id for Qwen3-8B-Base V0 latest artifact - id: get_Qwen3_8B_Base_latest_run_id_V0 - run: | - ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts") - RUN_ID=$(echo "$ARTIFACT_JSON" | \ - jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Qwen3-8B-Base-V0-report")] | sort_by(.created_at) | last | .workflow_run.id') - echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT" - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Download Qwen/Qwen2.5-VL-7B-Instruct V0 Artifact + - name: Download Artifact + if: ${{ steps.get_run_id.outputs.runid != '' }} uses: actions/download-artifact@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-VL-7B-Instruct-V0-report - path: ./docs/source/developer_guide/evaluation/accuracy_report - github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }} + name: ${{ github.event.inputs.vllm-ascend-branch }}-${{ steps.get_basename.outputs.model_base_name }}-V${{ matrix.version }}-report + path: ./docs/source/developer_guide/evaluation/accuracy_report_bak + github-token: ${{ secrets.GITHUB_TOKEN }} + repository: ${{ github.repository }} + run-id: ${{ steps.get_run_id.outputs.runid }} + + - name: Upload reports artifact + if: ${{ steps.get_run_id.outputs.runid != '' }} + uses: actions/upload-artifact@v4 + with: + name: report-${{ steps.get_basename.outputs.model_base_name }}-v${{ matrix.version }} + path: ./docs/source/developer_guide/evaluation/accuracy_report_bak/*.md + retention-days: 90 - - name: Download Qwen/Qwen2.5-7B-Instruct Artifact - uses: actions/download-artifact@v4 + create_pr: + runs-on: ubuntu-latest + needs: download_reports + steps: + - name: Checkout repository + uses: actions/checkout@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen2.5-7B-Instruct-V0-report - path: ./docs/source/developer_guide/evaluation/accuracy_report - github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }} + ref: ${{ github.event.inputs.vllm-ascend-branch }} + + - name: Setup workspace + run: mkdir -p ./accuracy/accuracy_report - - name: Download Qwen/Qwen3-8B-Base Artifact + - name: Download only current run reports uses: actions/download-artifact@v4 with: - name: ${{ github.event.inputs.vllm-ascend-version }}-Qwen3-8B-Base-V0-report path: ./docs/source/developer_guide/evaluation/accuracy_report + pattern: report-* github-token: ${{ secrets.GITHUB_TOKEN }} - repository: vllm-project/vllm-ascend - run-id: ${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }} + run-id: ${{ github.run_id }} + + - name: Delete old report + run: | + find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! 
-name 'index.md' -delete + find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \; + find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete - - name: Display Files - working-directory: ./docs/source/developer_guide/evaluation/accuracy_report + - name: Generate step summary + if: ${{ always() }} run: | - cat ./Qwen2.5-VL-7B-Instruct.md - cat ./Qwen2.5-7B-Instruct.md - cat ./Qwen3-8B-Base.md - - - name: Create Pull Request for markdown update + for report in ./docs/source/developer_guide/evaluation/accuracy_report/*.md; do + filename=$(basename "$report") + # skip index.md + if [ "$filename" = "index.md" ]; then + continue + fi + + if [ -f "$report" ]; then + { + echo -e "\n\n---\n" + echo "## 📄 Report File: $(basename $report)" + cat "$report" + } >> "$GITHUB_STEP_SUMMARY" + fi + done + + - name: Update accuracy_report/index.md + run: | + REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report" + INDEX_MD="$REPORT_DIR/index.md" + + { + echo "# Accuracy Report" + echo "" + echo "::: {toctree}" + echo ":caption: Accuracy Report" + echo ":maxdepth: 1" + + for report in "$REPORT_DIR"/*.md; do + filename="$(basename "$report" .md)" + if [ "$filename" != "index" ]; then + echo "$filename" + fi + done + + echo ":::" + } > "$INDEX_MD" + + - name: Create Pull Request uses: peter-evans/create-pull-request@v7 with: token: ${{ secrets.PR_TOKEN }} - base: ${{ github.event.inputs.branch }} - branch: auto-pr/accuracy-test - commit-message: "Update accuracy report for ${{ github.event.inputs.branch }}" + base: ${{ github.event.inputs.vllm-ascend-branch }} + branch: auto-pr/accuracy-report + commit-message: "Update accuracy reports for ${{ github.event.inputs.vllm-ascend-branch }}" add-paths: ./docs/source/developer_guide/evaluation/accuracy_report/*.md - title: "[Doc]Update accuracy report for ${{ github.event.inputs.branch }}" + title: "[Doc] Update accuracy reports for ${{ github.event.inputs.vllm-ascend-branch }}" body: | - The accuracy results running on Ascend NPU have changed, I'm updating the report. - Please review the changes. 
- + The accuracy results running on NPU Altlas A2 have changed, updating reports for: + ${{ + github.event.inputs.models == 'all' + && 'All models (Qwen2.5-7B-Instruct, Qwen2.5-VL-7B-Instruct, Qwen3-8B-Base)' + || github.event.inputs.models + }} + - [Workflow run][1] - - [Qwen2.5-7B-Instruct accuracy report][2] - - [Qwen2.5-VL-7B-Instruct accuracy report][3] - - [Qwen3-8B-Base accuracy report][4] - - [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - [2]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }} - [3]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }} - [4]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }} + + [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} \ No newline at end of file diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index d0a7debaa..999fb6ad5 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -34,8 +34,7 @@ on: # Current supported vLLM versions options: - main - - v0.9.0.1 - - v0.9.0 + - v0.9.1 - v0.7.3 vllm-ascend-version: description: 'vllm-ascend version:' @@ -96,7 +95,7 @@ jobs: # - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct model_name: ${{ fromJSON( (github.event.inputs.models == 'all' && - '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","model_name":"Qwen/Qwen3-8B-Base"]') || + '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') || (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && '["Qwen/Qwen2.5-7B-Instruct"]') || (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && @@ -159,7 +158,7 @@ jobs: repository: vllm-project/vllm path: ./vllm-empty # Please also update this when bump matched version - ref: ${{ github.event.inputs.vllm-version || 'v0.9.0' }} + ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }} - name: Install vllm-project/vllm from source working-directory: ./vllm-empty @@ -201,6 +200,7 @@ jobs: pip show torch | grep "Version:" | awk '{print "GHA_TORCH_VERSION="$2}' pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}' pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//' + echo "GHA_VLLM_ASCEND_VERSION=${{ github.event.inputs.vllm-ascend-version || github.ref }}" } >> "$GITHUB_ENV" - name: Print versions @@ -209,7 +209,7 @@ jobs: echo "Torch NPU: ${{ env.GHA_TORCH_NPU_VERSION }}" echo "Torch: ${{ env.GHA_TORCH_VERSION }}" echo "vLLM: ${{ env.GHA_VLLM_VERSION }}" - echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" + echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION }}" - name: Run Accuracy Test for V${{ matrix.vllm_use_version }} id: report @@ -238,10 +238,16 @@ jobs: run: | cat ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md >> $GITHUB_STEP_SUMMARY + - name: Sanitize version string for artifact naming + run: | + SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}" + echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV" + - name: Upload Report for V${{ matrix.vllm_use_version }} + if: ${{ github.event_name == 'workflow_dispatch' }} uses: actions/upload-artifact@v4 with: - name: "${{ env.GHA_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name 
}}-report" + name: "${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}-report" path: ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md if-no-files-found: warn retention-days: 90 diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml deleted file mode 100644 index 91cd9c412..000000000 --- a/.github/workflows/actionlint.yml +++ /dev/null @@ -1,53 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: Lint GitHub Actions workflows -on: - pull_request: - branches: - - 'main' - - '*-dev' - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run actionlint" - env: - SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086 - run: | - echo "::add-matcher::.github/workflows/matchers/actionlint.json" - tools/actionlint.sh -color diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml deleted file mode 100644 index a239d38fa..000000000 --- a/.github/workflows/codespell.yml +++ /dev/null @@ -1,47 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Adapted from vllm-project/vllm/blob/main/.github -# - -name: codespell - -on: - pull_request: - branches: - - 'main' - - '*-dev' - -jobs: - codespell: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Run codespell check - run: | - CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**') - CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn') - - codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}" diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml deleted file mode 100644 index 81cd69afe..000000000 --- a/.github/workflows/mypy.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: mypy - -on: - pull_request: - branches: - - 'main' - - '*-dev' - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. - paths: - - '**/*.py' - - '.github/workflows/mypy.yaml' - - 'tools/mypy.sh' - - 'mypy.ini' - -jobs: - mypy: - runs-on: ubuntu-latest - strategy: - matrix: - # TODO(yikun): Add 3.12 back when torch-npu support 3.12 - python-version: ["3.9", "3.10", "3.11"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu - - - name: Checkout vllm-project/vllm repo - uses: actions/checkout@v4 - with: - repository: vllm-project/vllm - path: vllm-empty - - - name: Install vllm-project/vllm from source - working-directory: vllm-empty - run: | - pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu - VLLM_TARGET_DEVICE=empty pip install . 
- - - name: Mypy - run: | - echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 67f8f9096..116912111 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -41,13 +41,18 @@ jobs: test: if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} - name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }} + name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}, use_v1=${{ matrix.vllm_use_v1 }} runs-on: 'linux-arm64-npu-static-8' strategy: matrix: include: + - vllm_branch: v0.9.1 + vllm_ascend_branch: main + vllm_use_v1: 0 - vllm_branch: v0.9.0 vllm_ascend_branch: main + vllm_use_v1: 1 + max-parallel: 1 container: image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10 volumes: @@ -67,6 +72,7 @@ jobs: HF_TOKEN: ${{ secrets.HF_TOKEN }} ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }} ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }} + VLLM_USE_V1: ${{ matrix.vllm_use_v1 }} steps: - name: Check npu and CANN info run: | @@ -136,7 +142,7 @@ jobs: - name: Install elastic_tool if: github.event_name != 'pull_request' run: | - pip install escli-tool==0.2.1 + pip install escli-tool==0.2.2 - name: Collect pr info from vllm-project/vllm-ascend if: github.event_name != 'pull_request' @@ -160,10 +166,10 @@ jobs: while IFS= read -r line || [[ -n "$line" ]]; do commit_id=${line%% *} commit_title=${line#* } - commit_time=$(git show -s --format=%cd $commit_hash --date=iso-strict) - commit_time_no_tz=${commit_time::19} git checkout $commit_id + commit_time=$(git show -s --format=%cd $commit_hash --date=iso-strict) + commit_time_no_tz=${commit_time::19} pip install -e . echo "------------------------" @@ -173,17 +179,17 @@ jobs: echo "vllm branch: ${{ matrix.vllm_branch }}" echo "vllm-ascend branch: ${{ matrix.vllm_ascend_branch }}" echo "------------------------" + cd /github/home bash benchmarks/scripts/run-performance-benchmarks.sh # send the result to es - if [[ "${{ github.event_name }}" != "pull request" ]]; then - escli add --vllm_branch ${{ matrix.vllm_branch }} \ - --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \ - --commit_id $commit_id \ - --commit_title "$commit_title" \ - --created_at "$commit_time_no_tz" \ - --res_dir ./benchmarks/results - rm -rf ./benchmarks/results - fi + escli add --vllm_branch ${{ matrix.vllm_branch }} \ + --vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \ + --commit_id $commit_id \ + --commit_title "$commit_title" \ + --created_at "$commit_time_no_tz" \ + --res_dir ./benchmarks/results \ + --extra_feat '{"VLLM_USE_V1": "${{ matrix.vllm_use_v1 }}"}' + rm -rf ./benchmarks/results cd - done < commit_log.txt diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml deleted file mode 100644 index a19e6a89d..000000000 --- a/.github/workflows/ruff.yml +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Adapted from vllm-project/vllm/blob/main/.github -# - -name: ruff - -on: - pull_request: - branches: - - 'main' - - '*-dev' - -jobs: - ruff: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 49a85f5be..9245820ff 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. # -name: 'e2e test' +name: 'test' on: pull_request: @@ -29,6 +29,12 @@ on: - '!docs/**' - 'pytest.ini' - '!benchmarks/**' + - 'tools/mypy.sh' + - 'mypy.ini' + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. # It's used to activate ascend-toolkit environment variables. @@ -37,12 +43,77 @@ defaults: shell: bash -el {0} jobs: - test: + lint: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Run codespell check + run: | + CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**') + CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn') + + codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}" + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only + - name: Running yapf + run: | + python -m pip install --upgrade pip + pip install toml + pip install yapf==0.32.0 + yapf --diff --recursive . 
+ + - name: Install dependencies + run: | + pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: vllm-empty + + - name: Actionlint Check + env: + SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086 + run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" + tools/actionlint.sh -color + + - name: Install vllm-project/vllm from source + working-directory: vllm-empty + run: | + pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu + VLLM_TARGET_DEVICE=empty pip install . + + - name: Mypy Check + run: | + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tools/mypy.sh 1 ${{ matrix.python-version }} + + e2e: + needs: [lint] + if: ${{ needs.lint.result == 'success' }} strategy: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [v0.9.0, v0.9.1] + vllm_version: [v0.9.1] concurrency: group: > ${{ @@ -107,34 +178,37 @@ jobs: run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py - pytest -sv tests/singlecard/test_scheduler.py # guided decoding doesn't work, fix it later # pytest -sv tests/singlecard/test_guided_decoding.py.py # test_ascend_config.py should be ran separately because it will regenerate the global config many times. pytest -sv tests/singlecard/test_ascend_config.py pytest -sv tests/singlecard/test_camem.py + pytest -sv tests/singlecard/core/test_ascend_scheduler.py + pytest -sv tests/singlecard/core/test_ascend_scheduler_e2e.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_scheduler.py \ --ignore=tests/singlecard/test_guided_decoding.py \ --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_camem.py + --ignore=tests/singlecard/test_camem.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py else pytest -sv tests/multicard/test_ilama_lora_tp2.py # To avoid oom, we need to run the test in a single process. 
VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py fi - name: Run vllm-project/vllm-ascend test on V0 engine + if: ${{ github.event_name == 'schedule' }} env: VLLM_USE_V1: 0 run: | if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then VLLM_USE_MODELSCOPE=True pytest -sv tests/singlecard/test_offline_inference.py - pytest -sv tests/singlecard/test_scheduler.py # guided decoding doesn't work, fix it later # pytest -sv tests/singlecard/test_guided_decoding.py.py pytest -sv tests/singlecard/test_camem.py @@ -143,11 +217,12 @@ jobs: pytest -sv tests/singlecard/test_prompt_embedding.py pytest -sv tests/singlecard/ \ --ignore=tests/singlecard/test_offline_inference.py \ - --ignore=tests/singlecard/test_scheduler.py \ --ignore=tests/singlecard/test_guided_decoding.py \ --ignore=tests/singlecard/test_camem.py \ --ignore=tests/singlecard/test_ascend_config.py \ - --ignore=tests/singlecard/test_prompt_embedding.py + --ignore=tests/singlecard/test_prompt_embedding.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler.py \ + --ignore=tests/singlecard/core/test_ascend_scheduler_e2e.py else pytest -sv tests/multicard/test_ilama_lora_tp2.py # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py will raise error. 
@@ -155,5 +230,6 @@ jobs: VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk + VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8 VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py fi diff --git a/.github/workflows/vllm_ascend_test_long_term.yaml b/.github/workflows/vllm_ascend_test_long_term.yaml index 8afb95048..7ac0a825b 100644 --- a/.github/workflows/vllm_ascend_test_long_term.yaml +++ b/.github/workflows/vllm_ascend_test_long_term.yaml @@ -40,7 +40,7 @@ jobs: max-parallel: 2 matrix: os: [linux-arm64-npu-1, linux-arm64-npu-4] - vllm_version: [v0.9.0, v0.9.1] + vllm_version: [v0.9.1] name: vLLM Ascend long term test runs-on: ${{ matrix.os }} container: @@ -94,7 +94,8 @@ jobs: if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then # spec decode test VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py - VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_spec_decode.py + # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed + # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_v1_spec_decode.py VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process pytest -sv tests/long_term/spec_decode --ignore=tests/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/long_term/spec_decode/e2e/test_v1_mtp_correctness.py pytest -sv tests/long_term/test_accuracy.py diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml index 0de616d4f..e44d60704 100644 --- a/.github/workflows/vllm_ascend_test_pd.yaml +++ b/.github/workflows/vllm_ascend_test_pd.yaml @@ -38,7 +38,7 @@ jobs: if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }} strategy: matrix: - vllm_verison: [v0.9.0, v0.9.1] + vllm_verison: [v0.9.1] name: vLLM Ascend prefilling decoding disaggregation test runs-on: linux-arm64-npu-static-8 diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml deleted file mode 100644 index 64497d156..000000000 --- a/.github/workflows/yapf.yml +++ /dev/null @@ -1,48 +0,0 @@ -# -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# Adapted from vllm-project/vllm/blob/main/.github -# - -name: yapf - -on: - pull_request: - branches: - - 'main' - - '*-dev' - paths: - - "**/*.py" - - .github/workflows/yapf.yml - -jobs: - yapf: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install toml - pip install yapf==0.32.0 - - name: Running yapf - run: | - yapf --diff --recursive . diff --git a/CMakeLists.txt b/CMakeLists.txt index a2c3ad2e1..8d06c75f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,5 +96,3 @@ target_link_libraries( target_link_options(vllm_ascend_C PRIVATE "-Wl,-rpath,$ORIGIN:$ORIGIN/lib") install(TARGETS vllm_ascend_C vllm_ascend_kernels DESTINATION ${VLLM_ASCEND_INSTALL_PATH}) - - diff --git a/Dockerfile b/Dockerfile index 1dfd10c86..952e77fe9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.0 +ARG VLLM_TAG=v0.9.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index ffd1174d2..2ff3d0b39 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/ # Install vLLM ARG VLLM_REPO=https://github.com/vllm-project/vllm.git -ARG VLLM_TAG=v0.9.0 +ARG VLLM_TAG=v0.9.1 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
diff --git a/benchmarks/ops/ben_vocabparallelembedding.py b/benchmarks/ops/ben_vocabparallelembedding.py new file mode 100644 index 000000000..e91cfed7b --- /dev/null +++ b/benchmarks/ops/ben_vocabparallelembedding.py @@ -0,0 +1,144 @@ +from typing import Tuple + +import numpy as np +import pytest +import torch +import torch_npu # noqa: F401 +import vllm # noqa: F401 + +import vllm_ascend.platform # noqa: F401 + + +def benchmark_npu(fn, num_iterations=100, num_warmup_iterations=50): + """ + Benchmark function for NPU operations + + Args: + fn: Function to benchmark + num_iterations: Number of timing iterations + num_warmup_iterations: Number of warmup iterations + + Returns: + float: Minimum elapsed time in seconds + """ + start = torch.npu.Event(enable_timing=True) + end = torch.npu.Event(enable_timing=True) + times = np.zeros(num_iterations + num_warmup_iterations) + + # Run iterations + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + fn() # Execute the function + end.record() + torch.npu.synchronize() + times[i] = start.elapsed_time(end) + + # Remove warmup iterations and convert to seconds + times = times[num_warmup_iterations:] + elapsed_time = np.amin(times) / 1000 + return elapsed_time + + +def get_masked_input_and_mask_ref( + input_: torch.Tensor, org_vocab_start_index: int, + org_vocab_end_index: int, num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Reference implementation for verification""" + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < + org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index) + added_offset = added_vocab_start_index - ( + org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding + valid_offset = (org_vocab_start_index * + org_vocab_mask) + (added_offset * added_vocab_mask) + vocab_mask = org_vocab_mask | added_vocab_mask + masked_input = vocab_mask * (input_ - valid_offset) + return masked_input, ~vocab_mask + + +DTYPES = [torch.int32] +SHAPES = [(3, 4, 5)] +DEVICES = [f"npu:{0}"] +SEEDS = [0] + + +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEEDS) +@torch.inference_mode() +def test_get_masked_input_and_mask( + shape: Tuple[int, ...], + dtype: torch.dtype, + device: str, + seed: int, +) -> None: + # Set random seed and device + torch.manual_seed(seed) + torch.set_default_device(device) + + # Generate random input tensor + input_tensor = torch.randint(0, 1000, shape, dtype=dtype) + + # Test parameters + test_case = { + "org_start": 100, + "org_end": 200, + "padding": 0, + "added_start": 300, + "added_end": 400, + } + + # Define reference function + def ref_fn(): + return get_masked_input_and_mask_ref(input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"]) + + # Define custom function + def custom_fn(): + return torch.ops._C.get_masked_input_and_mask(input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"]) + + # Get results for correctness testing + ref_masked_input, ref_mask = ref_fn() + custom_masked_input, custom_mask = custom_fn() + + # Benchmark both implementations + ref_time = benchmark_npu(ref_fn) + custom_time = benchmark_npu(custom_fn) + + # Print 
performance results + print("\nPerformance Results:") + print(f"Reference implementation: {ref_time*1000:.3f} ms") + print(f"Custom implementation: {custom_time*1000:.3f} ms") + print(f"Speedup: {ref_time/custom_time:.2f}x") + + # Compare results for correctness + ref_masked_input = ref_masked_input.to(dtype) + print("\nResults comparison:") + print("custom_masked_input:", custom_masked_input) + print("ref_masked_input:", ref_masked_input) + print("custom_mask:", custom_mask) + print("ref_mask:", ref_mask) + torch.testing.assert_close( + custom_masked_input, + ref_masked_input, + rtol=1e-5, + atol=1e-5, + msg=f"Masked input mismatch for case: {test_case}") + torch.testing.assert_close(custom_mask, + ref_mask, + rtol=1e-5, + atol=1e-5, + msg=f"Mask mismatch for case: {test_case}") diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py index f508feda7..79c58bcbc 100644 --- a/benchmarks/scripts/run_accuracy.py +++ b/benchmarks/scripts/run_accuracy.py @@ -26,7 +26,7 @@ import lm_eval import torch -UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B"] +UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B-Base"] UNIMODAL_TASK = ["ceval-valid", "gsm8k"] MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"] MULTIMODAL_TASK = ["mmmu_val"] @@ -36,17 +36,17 @@ MODEL_RUN_INFO = { "Qwen/Qwen2.5-7B-Instruct": ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" - "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen3-8B-Base": ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n" - "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1" ), "Qwen/Qwen2.5-VL-7B-Instruct": ("export MODEL_ARGS='pretrained={model}, max_model_len=8192,dtype=auto,tensor_parallel_size=4,max_images=2'\n" - "lm_eval --model vllm-vlm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n" + "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n" "--apply_chat_template --fewshot_as_multiturn --batch_size 1"), } diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json index 576ced213..40cec4c5f 100644 --- a/benchmarks/tests/latency-tests.json +++ b/benchmarks/tests/latency-tests.json @@ -9,5 +9,15 @@ "num_iters_warmup": 5, "num_iters": 15 } + }, + { + "test_name": "latency_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } } ] diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json index d8ad2be2b..c8d5cdaf9 100644 --- a/benchmarks/tests/serving-tests.json +++ b/benchmarks/tests/serving-tests.json @@ -49,5 +49,29 @@ "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } + }, + { + "test_name": "serving_qwen2_5_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + 
"load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } } ] diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json index 551d23882..3698e69f3 100644 --- a/benchmarks/tests/throughput-tests.json +++ b/benchmarks/tests/throughput-tests.json @@ -22,6 +22,17 @@ "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", "num_prompts": 200 } + }, + { + "test_name": "throughput_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } } ] diff --git a/csrc/kernels/get_masked_input_and_mask_kernel.cpp b/csrc/kernels/get_masked_input_and_mask_kernel.cpp new file mode 100644 index 000000000..47ce826f2 --- /dev/null +++ b/csrc/kernels/get_masked_input_and_mask_kernel.cpp @@ -0,0 +1,345 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + */ + +#include "kernel_operator.h" +#include "kernel_tensor_impl.h" +#include "kernel_type.h" +#include "types.h" +#include "utils.h" +using vllm_ascend::AccType; + +template +class GetMaskedInputAndMask { +public: + __aicore__ inline GetMaskedInputAndMask() {} + + __aicore__ inline ~GetMaskedInputAndMask() { + pipe.Reset(); + } + + + __aicore__ inline void Init( + __gm__ scalar_t* input, + __gm__ scalar_t* masked_input, + __gm__ bool* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size) + { + // Initialize basic parameters + input_ = input; + masked_input_ = masked_input; + mask_out_ = mask_out; + org_vocab_start_index_ = org_vocab_start_index; + org_vocab_end_index_ = org_vocab_end_index; + size_ = ((size + 31) / 32) * 32; + added_offset_ = added_vocab_start_index - + (org_vocab_end_index - org_vocab_start_index) - + num_org_vocab_padding; + added_vocab_start_index_ = added_vocab_start_index; + added_vocab_end_index_ = added_vocab_end_index; + + // Initialize global tensors + inputGlobal.SetGlobalBuffer(input); + maskedOutputGlobal.SetGlobalBuffer(masked_input); + maskOutGlobal.SetGlobalBuffer(mask_out); + + // Initialize queues + pipe.InitBuffer(inQueue, 1, size_ * sizeof(scalar_t)); + pipe.InitBuffer(outQueue, 1, size_ * sizeof(scalar_t)); + pipe.InitBuffer(maskQueue, 1, size_ * sizeof(bool)); + + // Initialize calculation buffers + pipe.InitBuffer(calc_buf_1, size_ * sizeof(float)); + pipe.InitBuffer(calc_buf_2, size_ * sizeof(float)); + + // Initialize result queues + pipe.InitBuffer(result_ge_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_le_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_org_mask_que, BUFFER_NUM, size_ * sizeof(float)); + pipe.InitBuffer(result_add_mask_que, BUFFER_NUM, size_ * sizeof(float)); + + // Initialize temporary buffers + pipe.InitBuffer(start_buf, size_ * sizeof(float)); + pipe.InitBuffer(end_buf, size_ * sizeof(float)); + pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); + pipe.InitBuffer(validOffset_buf, size_ * sizeof(float)); + pipe.InitBuffer(vocabMask_buf_, size_ * sizeof(int8_t)); + pipe.InitBuffer(ones_buf_, size_ * sizeof(float)); + } + + 
__aicore__ inline void Process() + { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() + { + AscendC::LocalTensor inputLocal = inQueue.AllocTensor(); + AscendC::DataCopy(inputLocal, inputGlobal, size_); + inQueue.EnQue(inputLocal); + } + + __aicore__ inline void CompareWithValue( + AscendC::LocalTensor& result, + const AscendC::LocalTensor& input, + const AscendC::LocalTensor& compare_value, + bool is_greater_equal) { + + AscendC::LocalTensor compute_buf = calc_buf_1.Get(); + if (is_greater_equal) { + AscendC::Max(compute_buf, input, compare_value, size_); + AscendC::Sub(compute_buf, compare_value, compute_buf, size_); + } else { + AscendC::Max(compute_buf, input, compare_value, size_); + AscendC::Sub(compute_buf, compute_buf, compare_value, size_); + } + + AscendC::Abs(compute_buf, compute_buf, size_); + AscendC::Mins(compute_buf, compute_buf, MIN_ACCURACY_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_); + AscendC::Muls(compute_buf, compute_buf, MAX_MUL_2_FP32, size_); + AscendC::Adds(compute_buf, compute_buf, NEGATIVE_ONE_FP32, size_); + AscendC::Abs(compute_buf, compute_buf, size_); + + AscendC::LocalTensor compute_buf_fp16 = calc_buf_2.Get(); + AscendC::Cast(compute_buf_fp16, compute_buf, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(result, compute_buf_fp16, AscendC::RoundMode::CAST_NONE, size_); + } + + __aicore__ inline void ComputeRangeMask( + AscendC::LocalTensor& range_mask, + const AscendC::LocalTensor& input, + const float start_value, + const float end_value) { + + // Use already initialized buffers + AscendC::LocalTensor start_value_tensor = start_buf.Get(); + AscendC::LocalTensor end_value_tensor = end_buf.Get(); + + AscendC::Duplicate(start_value_tensor, start_value, size_); + AscendC::Duplicate(end_value_tensor, end_value, size_); + + AscendC::LocalTensor ge_result = result_ge_que.AllocTensor(); + AscendC::LocalTensor lt_result = result_le_que.AllocTensor(); + + CompareWithValue(ge_result, start_value_tensor, input, true); + CompareWithValue(lt_result, input, end_value_tensor, false); + + AscendC::And(range_mask, ge_result, lt_result, size_); + } + + __aicore__ inline void Compute() { + AscendC::LocalTensor inputLocal = inQueue.DeQue(); + AscendC::LocalTensor maskedLocal = outQueue.AllocTensor(); + AscendC::LocalTensor maskLocal = maskQueue.AllocTensor(); + + AscendC::LocalTensor inputFloat = inputFloat_buf.Get(); + AscendC::Cast(inputFloat, inputLocal, AscendC::RoundMode::CAST_NONE, size_); + + // Calculate mask for org_vocab range + // org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) + AscendC::LocalTensor orgVocabMask = result_org_mask_que.AllocTensor(); + ComputeRangeMask(orgVocabMask, + inputFloat, + static_cast(org_vocab_start_index_), + static_cast(org_vocab_end_index_)); + + // Calculate mask for added_vocab range + // added_vocab_mask = (input_ >= added_vocab_start_index) & (input_ < added_vocab_end_index) + AscendC::LocalTensor addedVocabMask = result_add_mask_que.AllocTensor(); + ComputeRangeMask(addedVocabMask, + inputFloat, + static_cast(added_vocab_start_index_), + static_cast(added_vocab_end_index_)); + + // Calculate validOffset + // valid_offset = (org_vocab_start_index * org_vocab_mask) + (added_offset * added_vocab_mask) + AscendC::LocalTensor validOffset = validOffset_buf.Get(); + AscendC::LocalTensor constOrgStartIndex = start_buf.Get(); + + 
AscendC::Duplicate(constOrgStartIndex, float(org_vocab_start_index_), size_); + + AscendC::LocalTensor orgVocabMask_fp16; + AscendC::LocalTensor orgVocabMask_fp32; + AscendC::Cast(orgVocabMask_fp16, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(orgVocabMask_fp32, orgVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Mul(validOffset, + constOrgStartIndex, + orgVocabMask_fp32, + size_); + + AscendC::LocalTensor addedOffset; + AscendC::LocalTensor addedOffsetTensor = end_buf.Get(); + AscendC::Duplicate(addedOffsetTensor, float(added_offset_), size_); + + AscendC::LocalTensor addedVocabMask_fp16; + AscendC::LocalTensor addedVocabMask_fp32; + AscendC::Cast(addedVocabMask_fp16, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(addedVocabMask_fp32, addedVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Mul(addedOffset, + addedOffsetTensor, + addedVocabMask_fp32, + size_); + + AscendC::Add(validOffset, validOffset, addedOffset, size_); + + // vocab_mask = org_vocab_mask | added_vocab_mask + AscendC::LocalTensor vocabMask = vocabMask_buf_.Get(); + + AscendC::Or(vocabMask, + orgVocabMask, + addedVocabMask, + size_); + + AscendC::Sub(inputFloat, inputFloat, validOffset, size_); + + // input_ = vocab_mask * (input_ - valid_offset) + AscendC::LocalTensor vocabMask_fp16; + AscendC::LocalTensor vocabMask_fp32; + AscendC::Cast(vocabMask_fp16, vocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(vocabMask_fp32, vocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::LocalTensor inputFloat_fp32; + AscendC::Mul(inputFloat, inputFloat, vocabMask_fp32, size_); + + AscendC::Cast(maskedLocal, inputFloat, AscendC::RoundMode::CAST_CEIL, size_); + outQueue.EnQue(maskedLocal); + + // ~vocab_mask + AscendC::LocalTensor ones_tensor = ones_buf_.Get(); + AscendC::Duplicate(ones_tensor, (float)1, size_); + AscendC::LocalTensor maskLocal_fp32; + + AscendC::Sub(maskLocal_fp32, + ones_tensor, + vocabMask_fp32, + size_); + + AscendC::LocalTensor maskLocal_fp16; + AscendC::Cast(maskLocal_fp16, maskLocal_fp32, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(maskLocal, maskLocal_fp16, AscendC::RoundMode::CAST_NONE, size_); + maskQueue.EnQue(maskLocal); + inQueue.FreeTensor(inputLocal); + } + + __aicore__ inline void CopyOut() + { + AscendC::LocalTensor maskedLocal = outQueue.DeQue(); + AscendC::LocalTensor maskLocal = maskQueue.DeQue(); + + AscendC::DataCopy(maskedOutputGlobal, maskedLocal, size_); + AscendC::DataCopy(maskOutGlobal, maskLocal, size_); + + outQueue.FreeTensor(maskedLocal); + maskQueue.FreeTensor(maskLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + AscendC::TPipe pipe; + AscendC::TQue inQueue; + AscendC::TQue outQueue, maskQueue; + AscendC::GlobalTensor inputGlobal, maskedOutputGlobal; + AscendC::GlobalTensor maskOutGlobal; + AscendC::TBuf calc_buf_1; + AscendC::TBuf calc_buf_2; + AscendC::TQue result_ge_que; + AscendC::TQue result_le_que; + AscendC::TQue result_org_mask_que; + AscendC::TQue result_add_mask_que; + + // Temporary buffers + AscendC::TBuf start_buf; + AscendC::TBuf end_buf; + + // Temporary buffers continued + AscendC::TBuf inputFloat_buf; + AscendC::TBuf validOffset_buf; + AscendC::TBuf vocabMask_buf_; + AscendC::TBuf ones_buf_; + + __gm__ scalar_t *input_, *masked_input_; + __gm__ bool *mask_out_; + int64_t size_; + int64_t org_vocab_start_index_, org_vocab_end_index_; + int64_t added_vocab_start_index_, added_vocab_end_index_; + int64_t added_offset_; + + static 
constexpr float MIN_ACCURACY_FP32 = 1.1754943508222875e-38; + static constexpr float MAX_MUL_1_FP32 = 1125899906842624; + static constexpr float MAX_MUL_2_FP32 = 67108864; + static constexpr float NEGATIVE_ONE_FP32 = -1.0f; +}; + +extern "C" __global__ __aicore__ void get_masked_input_and_mask_kernel( + __gm__ int32_t* input, + __gm__ int32_t* masked_input, + __gm__ bool* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num) +{ + { + GetMaskedInputAndMask op{}; + + for (int64_t i = AscendC::GetBlockIdx(); i < loop_cnt; i += aiv_num) { + op.Init(input + i * size/loop_cnt, + masked_input + i * size/loop_cnt, + mask_out + i * size/loop_cnt, + org_vocab_start_index, org_vocab_end_index, + num_org_vocab_padding, added_vocab_start_index, + added_vocab_end_index, size/loop_cnt); + + op.Process(); + } + } // op destructor called here +} + +namespace vllm_ascend { + +void get_masked_input_and_mask_impl( + void* stream, + void* input, + void* masked_input, + void* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num) +{ + get_masked_input_and_mask_kernel<<>>( + static_cast(input), + static_cast(masked_input), + static_cast(mask_out), + org_vocab_start_index, + org_vocab_end_index, + num_org_vocab_padding, + added_vocab_start_index, + added_vocab_end_index, + size, + loop_cnt, + aiv_num); +} + +} // namespace vllm_ascend + diff --git a/csrc/ops.h b/csrc/ops.h index b921b2bf9..b1bc6028e 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -31,6 +31,20 @@ namespace vllm_ascend { const int headSize, const int64_t numTokens, const uint32_t loopCnt, uint32_t aivNum); + extern void get_masked_input_and_mask_impl( + void* stream, + void* input, + void* masked_input, + void* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num); + torch::Tensor weak_ref_tensor(torch::Tensor& tensor) { if (!tensor.is_privateuseone()) { throw std::runtime_error("Tensor must be on NPU device"); diff --git a/csrc/torch_binding.cpp b/csrc/torch_binding.cpp index c4154381d..001a9314d 100644 --- a/csrc/torch_binding.cpp +++ b/csrc/torch_binding.cpp @@ -99,6 +99,112 @@ std::tuple rotary_embedding(at::Tensor &positions, at::T return {query_dst, key_dst}; } +std::tuple get_masked_input_and_mask( + at::Tensor &input, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index) + /* + https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/vocab_parallel_embedding.py#L161-L198 + Embedding parallelized in the vocabulary dimension. + + Adapted from torch.nn.Embedding, note that we pad the vocabulary size to + make sure it is divisible by the number of model parallel GPUs. + + In order to support various loading methods, we ensure that LoRA-added + embeddings are always at the end of TP-sharded tensors. 
In other words, + we shard base embeddings and LoRA embeddings separately (both padded), + and place them in the same tensor. + In this example, we will have the original vocab size = 1010, + added vocab size = 16 and padding to 64. Therefore, the total + vocab size with padding will be 1088 (because we first pad 1010 to + 1024, add 16, and then pad to 1088). + Therefore, the tensor format looks like the following: + TP1, rank 0 (no sharding): + |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >| + corresponding token_id: | 0 | 1 | ... | 1009 | -1 | ... | -1 | 1010 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 | + + TP2, rank 0: + |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >| + corresponding token_id: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 1000 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 520 | ... | 543 | + TP2, rank 1: + |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >| + corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 | + Parameters: + org_vocab_start_index //base embeddings start + org_vocab_end_index //base embeddings end + num_org_vocab_padding //base embeddings padding + added_vocab_start_index //LoRA embeddings start + added_vocab_end_index //LoRA embeddings end + */ +{ + // Input validation + TORCH_CHECK(input.dim() >= 1, "input must have at least 1 dimension"); + TORCH_CHECK(org_vocab_start_index >= 0, "org_vocab_start_index must be non-negative"); + TORCH_CHECK(org_vocab_end_index >= org_vocab_start_index, "org_vocab_end_index must be greater than org_vocab_start_index"); + TORCH_CHECK(num_org_vocab_padding >= 0, "num_org_vocab_padding must be non-negative"); + TORCH_CHECK(added_vocab_start_index >= org_vocab_end_index, "added_vocab_start_index must be greater than org_vocab_end_index"); + TORCH_CHECK(added_vocab_end_index >= added_vocab_start_index, "added_vocab_end_index must be greater than added_vocab_start_index"); + + // Get total number of elements + int64_t size = input.numel(); + + // Create output tensors + at::Tensor masked_input = at::empty_like(input); + at::Tensor mask = at::empty_like(input).to(at::kBool); + + // Get data pointers + void *input_ptr = input.data_ptr(); + void *masked_input_ptr = masked_input.data_ptr(); + void *mask_ptr = mask.data_ptr(); + + // Get current stream + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + + // Get scalar type + at::ScalarType scalar_type = input.scalar_type(); + + // Create and configure OpCommand + at_npu::native::OpCommand cmd; + cmd.Name("get_masked_input_and_mask"); + cmd.SetCustomHandler([scalar_type, size, stream, + input_ptr, masked_input_ptr, mask_ptr, + org_vocab_start_index, org_vocab_end_index, + num_org_vocab_padding, added_vocab_start_index, + added_vocab_end_index]() -> int { + // Get platform info + fe::PlatFormInfos platform_infos; + int device_id = 0; + fe::PlatformInfoManager::GeInstance().GetRuntimePlatformInfosByDevice(device_id, platform_infos); + uint32_t aivNum = platform_infos.GetCoreNumByType("aiv"); + uint32_t loop_cnt = (size + aivNum - 1) / aivNum; + + // Call implementation + get_masked_input_and_mask_impl( + stream, + input_ptr, + masked_input_ptr, + mask_ptr, + org_vocab_start_index, 
+ org_vocab_end_index, + num_org_vocab_padding, + added_vocab_start_index, + added_vocab_end_index, + size, + loop_cnt, + aivNum); + + return 0; + }); + cmd.Run(); + return {masked_input, mask}; +} + void verify_tensor(std::string const& name, at::Tensor const& t, int64_t const size_0, int64_t const size_1, c10::ScalarType const type) { @@ -194,6 +300,16 @@ TORCH_LIBRARY_EXPAND(_C, ops) " Tensor! key, int head_size," " Tensor cos_sin_cache, bool is_neox) -> (Tensor query, Tensor key)"); ops.impl("rotary_embedding", torch::kPrivateUse1, &vllm_ascend::rotary_embedding); + + ops.def( + "get_masked_input_and_mask(Tensor input, " + " int org_vocab_start_index, " + " int org_vocab_end_index, " + " int num_org_vocab_padding, " + " int added_vocab_start_index, " + " int added_vocab_end_index) -> (Tensor masked_input, Tensor mask)"); + ops.impl("get_masked_input_and_mask", torch::kPrivateUse1, &vllm_ascend::get_masked_input_and_mask); + ops.def( "advance_step_flashattn_ascendc(int num_seqs, int num_queries, int block_size," " Tensor! input_tokens, Tensor! sampled_token_ids, Tensor! input_positions," diff --git a/docs/source/community/contributors.md b/docs/source/community/contributors.md index efbea6657..d61259de8 100644 --- a/docs/source/community/contributors.md +++ b/docs/source/community/contributors.md @@ -1,4 +1,6 @@ -# Maintainers +# Maintainers and contributors + +## Maintainers | Name | Github ID | Date | |:-----------:|:-----:|:-----:| @@ -6,13 +8,30 @@ | Yikun Jiang| [@Yikun](https://github.com/Yikun) | 2025/02 | | Yi Gan| [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/02 | -# Contributors +## Contributors vLLM Ascend every release would not have been possible without the following contributors: +Updated on 2025-06-10: + | Number | Contributor | Date | Commit ID | |:------:|:-----------:|:-----:|:---------:| -| 51 | [@YisongJiang](https://github.com/YisongJiang) | 2025/5/29 | [90afaf6](https://github.com/vllm-project/vllm-ascend/commit/90afaf6306f680307462becf3c78585737579851) | +| 66 | [@Yuxiao-Xu](https://github.com/Yuxiao-Xu) | 2025/6/9 | [6b853f1](https://github.com/vllm-project/vllm-ascend/commit/6b853f15fe69ba335d2745ebcf14a164d0bcc505) | +| 65 | [@ChenTaoyu-SJTU](https://github.com/ChenTaoyu-SJTU) | 2025/6/7 | [20dedba](https://github.com/vllm-project/vllm-ascend/commit/20dedba5d1fc84b7ae8b49f9ce3e3649389e2193) | +| 64 | [@zxdukki](https://github.com/zxdukki) | 2025/6/7 | [87ebaef](https://github.com/vllm-project/vllm-ascend/commit/87ebaef4e4e519988f27a6aa378f614642202ecf) | +| 63 | [@sdmyzlp](https://github.com/sdmyzlp) | 2025/6/7 | [3640c60](https://github.com/vllm-project/vllm-ascend/commit/3640c60b0eb4d4cb104e20bfa406d3f1d17920a7) | +| 62 | [@weijinqian0](https://github.com/weijinqian0) | 2025/6/7 | [e9ada68](https://github.com/vllm-project/vllm-ascend/commit/e9ada685ece798f9fe0d4a287e3f5246a8a7207b) | +| 61 | [@hahazhky](https://github.com/hahazhky) | 2025/6/6 | [0b12c2a](https://github.com/vllm-project/vllm-ascend/commit/0b12c2acf7d9fd192beebebf662298067d9a5435) | +| 60 | [@depeng1994](https://github.com/depeng1994) | 2025/6/6 | [6b094a2](https://github.com/vllm-project/vllm-ascend/commit/6b094a2bd49a8a41eb3647568b2d9e5b337db81f) | +| 59 | [@David9857](https://github.com/David9857) | 2025/6/5 | [78431b3](https://github.com/vllm-project/vllm-ascend/commit/78431b34694dfa3c8f54ed7cc626660318557927) | +| 58 | [@momo609](https://github.com/momo609) | 2025/6/5 | [908a851](https://github.com/vllm-project/vllm-ascend/commit/908a851a776cfd9051cc062119e6ec481561c6f7) | +| 57 
| [@zhangxinyuehfad](https://github.com/zhangxinyuehfad) | 2025/6/5 | [7737aaa](https://github.com/vllm-project/vllm-ascend/commit/7737aaa40f699b233a35fb61e908b687adc1e2e5) | +| 56 | [@NINGBENZHE](https://github.com/NINGBENZHE) | 2025/6/3 | [6ec64a3](https://github.com/vllm-project/vllm-ascend/commit/6ec64a3f9686df65b5a23a41aa301e669db19099) | +| 55 | [@XWFAlone](https://github.com/XWFAlone) | 2025/5/30 | [3442fbd](https://github.com/vllm-project/vllm-ascend/commit/3442fbdb235b4c6d72c2bc64a49707a7bd89958e) | +| 54 | [@YisongJiang](https://github.com/YisongJiang) | 2025/5/29 | [90afaf6](https://github.com/vllm-project/vllm-ascend/commit/90afaf6306f680307462becf3c78585737579851) | +| 53 | [@ponix-j](https://github.com/ponix-j) | 2025/5/23 | [df58fb8](https://github.com/vllm-project/vllm-ascend/commit/df58fb80eee24139fc61c495be3ce79cf81b3f73) | +| 52 | [@ttanzhiqiang](https://github.com/ttanzhiqiang) | 2025/5/23 | [dc6172e](https://github.com/vllm-project/vllm-ascend/commit/dc6172efd3860ce95b40a7b3e93611f875f06d40) | +| 51 | [@yangpuPKU](https://github.com/yangpuPKU) | 2025/5/23 | [46df67a](https://github.com/vllm-project/vllm-ascend/commit/46df67a5e9ab73fade08cbb2d8c0155cee7316d1) | | 50 | [@wonderful199082](https://github.com/wonderful199082) | 2025/5/20 | [5cf9ff1](https://github.com/vllm-project/vllm-ascend/commit/5cf9ff18e91b0b7031c258d71a257b8e24689763) | | 49 | [@22dimensions](https://github.com/22dimensions) | 2025/5/17 | [a8730e7](https://github.com/vllm-project/vllm-ascend/commit/a8730e7a3c4ac6c4b39a5946c943252fdea6cce5) | | 48 | [@cxcxflying](https://github.com/cxcxflying) | 2025/5/13 | [e564470](https://github.com/vllm-project/vllm-ascend/commit/e56447033889ca95df512208cab22ef832bfdf07) | diff --git a/docs/source/conf.py b/docs/source/conf.py index ba8d83949..63fb5dbe4 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -69,10 +69,10 @@ # the branch of vllm-ascend, used in vllm-ascend clone and image tag # - main branch: 'main' # - vX.Y.Z branch: latest vllm-ascend release tag - 'vllm_ascend_version': 'v0.9.0rc1', + 'vllm_ascend_version': 'v0.9.0rc2', # the newest release version of vllm-ascend and matched vLLM, used in pip install. # This value should be updated when cut down release. 
- 'pip_vllm_ascend_version': "0.9.0rc1", + 'pip_vllm_ascend_version': "0.9.0rc2", 'pip_vllm_version': "0.9.0", # CANN image tag 'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10", diff --git a/docs/source/developer_guide/evaluation/accuracy_report/index.md b/docs/source/developer_guide/evaluation/accuracy_report/index.md new file mode 100644 index 000000000..51fae4886 --- /dev/null +++ b/docs/source/developer_guide/evaluation/accuracy_report/index.md @@ -0,0 +1,6 @@ +# Accuracy Report + +:::{toctree} +:caption: Accuracy Report +:maxdepth: 1 +::: \ No newline at end of file diff --git a/docs/source/developer_guide/evaluation/index.md b/docs/source/developer_guide/evaluation/index.md index 324c2e2f2..ea861abb1 100644 --- a/docs/source/developer_guide/evaluation/index.md +++ b/docs/source/developer_guide/evaluation/index.md @@ -6,6 +6,7 @@ using_lm_eval using_opencompass using_evalscope +accuracy_report/index ::: :::{toctree} diff --git a/docs/source/developer_guide/evaluation/profile_execute_duration.md b/docs/source/developer_guide/evaluation/profile_execute_duration.md index 8989bf9cf..3b37df6be 100644 --- a/docs/source/developer_guide/evaluation/profile_execute_duration.md +++ b/docs/source/developer_guide/evaluation/profile_execute_duration.md @@ -9,6 +9,11 @@ The execution duration of each stage (including pre/post-processing, model forwa * Use the non-blocking API `ProfileExecuteDuration().capture_async` to set observation points asynchronously when you need to observe the execution duration. * Use the blocking API `ProfileExecuteDuration().pop_captured_sync` at an appropriate time to get and print the execution durations of all observed stages. +**We have instrumented the key inference stages (including pre-processing, model forward pass, etc.) for execute duration profiling. 
Execute the script as follows:**
+```
+VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE=1 python3 vllm-ascend/examples/offline_inference_npu.py
+```
+
## Example Output
```
diff --git a/docs/source/developer_guide/versioning_policy.md b/docs/source/developer_guide/versioning_policy.md
index bbb08c3be..1533b0af5 100644
--- a/docs/source/developer_guide/versioning_policy.md
+++ b/docs/source/developer_guide/versioning_policy.md
@@ -22,6 +22,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo |
|-------------|--------------|------------------|-------------|--------------------|--------------|
+| v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
| v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
| v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
| v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | |
@@ -34,6 +35,7 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:
| Date | Event |
|------------|-------------------------------------------|
+| 2025.06.10 | Release candidates, v0.9.0rc2 |
| 2025.06.09 | Release candidates, v0.9.0rc1 |
| 2025.05.29 | v0.7.x post release, v0.7.3.post1 |
| 2025.05.08 | v0.7.x Final release, v0.7.3 |
@@ -71,6 +73,7 @@ Usually, each minor version of vLLM (such as 0.7) will correspond to a vLLM Asce
| Branch | Status | Note |
|------------|--------------|--------------------------------------|
| main | Maintained | CI commitment for vLLM main branch and vLLM 0.9.x branch |
+| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.0 and 0.9.1 versions |
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version |
| v0.7.1-dev | Unmaintained | Replaced by v0.7.3-dev |
diff --git a/docs/source/faqs.md b/docs/source/faqs.md
index 8c840be45..1de3befb2 100644
--- a/docs/source/faqs.md
+++ b/docs/source/faqs.md
@@ -3,7 +3,7 @@
## Version Specific FAQs
- [[v0.7.3.post1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1007)
-- [[v0.9.0rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115)
+- [[v0.9.0rc2] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/1115)
## General FAQs
@@ -69,14 +69,14 @@ If all above steps are not working, feel free to submit a GitHub issue.
### 7. How does vllm-ascend perform?
-Currently, only some models are improved. Such as `Qwen2 VL`, `Deepseek V3`. Others are not good enough. In the future, we will support graph mode and custom ops to improve the performance of vllm-ascend. And when the official release of vllm-ascend is released, you can install `mindie-turbo` with `vllm-ascend` to speed up the inference as well.
+Currently, only some models are well optimized, such as `Qwen2 VL` and `Deepseek V3`; others are not good enough. From 0.9.0rc2, Qwen and DeepSeek work with graph mode and deliver good performance. What's more, you can install `mindie-turbo` with `vllm-ascend v0.7.3` to speed up the inference as well.
### 8. How vllm-ascend work with vllm?
vllm-ascend is a plugin for vllm. Basically, the version of vllm-ascend is the same as the version of vllm. For example, if you use vllm 0.7.3, you should use vllm-ascend 0.7.3 as well. For main branch, we will make sure `vllm-ascend` and `vllm` are compatible by each commit.
### 9. Does vllm-ascend support Prefill Disaggregation feature?
-Currently, only 1P1D is supported by vllm.
For vllm-ascend, it'll be done by [this PR](https://github.com/vllm-project/vllm-ascend/pull/432). For NPND, vllm is not stable and fully supported yet. We will make it stable and supported by vllm-ascend in the future.
+Currently, only 1P1D is supported on the V0 Engine. V1 Engine and NPND support will be stabilized and added to vllm-ascend in the future.
### 10. Does vllm-ascend support quantization method?
@@ -84,9 +84,7 @@ Currently, w8a8 quantization is already supported by vllm-ascend originally on v
### 11. How to run w8a8 DeepSeek model?
-Currently, w8a8 DeepSeek is working in process: [support AscendW8A8 quantization](https://github.com/vllm-project/vllm-ascend/pull/511)
-
-Please run DeepSeek with BF16 now, following the [Multi-Node DeepSeek inferencing tutorail](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_node.html)
+Please follow the [quantization inferencing tutorial](https://vllm-ascend.readthedocs.io/en/main/tutorials/multi_npu_quantization.html) and replace the model with DeepSeek.
### 12. There is not output in log when loading models using vllm-ascend, How to solve it?
@@ -115,3 +113,13 @@ In scenarios where NPUs have limited HBM (High Bandwidth Memory) capacity, dynam
- **Adjust `--gpu-memory-utilization`**: If unspecified, will use the default value of `0.9`. You can decrease this param to reserve more memory to reduce fragmentation risks. See more note in: [vLLM - Inference and Serving - Engine Arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine.arg_utils-_engine_args_parser-cacheconfig).
- **Configure `PYTORCH_NPU_ALLOC_CONF`**: Set this environment variable to optimize NPU memory management. For example, you can `export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` to enable virtual memory feature to mitigate memory fragmentation caused by frequent dynamic memory size adjustments during runtime, see more note in: [PYTORCH_NPU_ALLOC_CONF](https://www.hiascend.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html).
+
+### 15. Failed to enable NPU graph mode when running DeepSeek?
+You may encounter the following error when running DeepSeek with NPU graph mode enabled. When both MLA and graph mode are enabled, the allowed number of queries per KV head is restricted to {32, 64, 128}. **Thus this is not supported for DeepSeek-V2-Lite**, as it only has 16 attention heads. NPU graph mode support for DeepSeek-V2-Lite will be added in the future.
+
+And if you're using DeepSeek-V3 or DeepSeek-R1, please make sure that, after the tensor parallel split, num_heads / num_kv_heads is in {32, 64, 128}.
+
+```bash
+[rank0]: RuntimeError: EZ9999: Inner Error!
+[rank0]: EZ9999: [PID: 62938] 2025-05-27-06:52:12.455.807 numHeads / numKvHeads = 8, MLA only support {32, 64, 128}.[FUNC:CheckMlaAttrs][FILE:incre_flash_attention_tiling_check.cc][LINE:1218]
+```
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 8fbe1825d..c9f684491 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -78,17 +78,17 @@ source vllm-ascend-env/bin/activate
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple attrs 'numpy<2.0.0' decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
# Download and install the CANN package.
-wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run chmod +x ./Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run ./Ascend-cann-toolkit_8.1.RC1_linux-"$(uname -i)".run --full source /usr/local/Ascend/ascend-toolkit/set_env.sh -wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run chmod +x ./Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run ./Ascend-cann-kernels-910b_8.1.RC1_linux-"$(uname -i)".run --install -wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.1.RC1/Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run chmod +x ./Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run ./Ascend-cann-nnal_8.1.RC1_linux-"$(uname -i)".run --install diff --git a/docs/source/user_guide/additional_config.md b/docs/source/user_guide/additional_config.md index 51b8e5431..d4756ef5e 100644 --- a/docs/source/user_guide/additional_config.md +++ b/docs/source/user_guide/additional_config.md @@ -40,11 +40,13 @@ The details of each config option are as follows: | Name | Type | Default | Description | | ---- | ---- | ------- | ----------- | | `enabled` | bool | `False` | Whether to enable torchair graph mode | +| `enable_multistream_mla`| bool | `False` | Whether to put vector ops of MLA to another stream | +| `enable_multistream_moe`| bool | `False` | Whether to enable multistream shared expert | | `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization | | `use_cached_graph` | bool | `False` | Whether to use cached graph | | `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache | | `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty | -| `enable_multistream_shared_expert`| bool | `False` | Whether to enable multistream shared expert | +| `enable_kv_nz`| bool | `False` | Whether to enable kvcache NZ layout | **ascend_scheduler_config** @@ -52,7 +54,7 @@ The details of each config option are as follows: | ---- | ---- | ------- | ----------- | | `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine| -ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `chunked_prefill_enabled: true` to ascend_scheduler_config as well. +ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well. 
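As a quick illustration of the paragraph above, the sketch below shows how such scheduler options are typically passed through the `additional_config` argument of `LLM` in Python. This is a minimal sketch only, not part of this change; the model name is just a placeholder, and it assumes the Python API accepts `additional_config` directly, as in the graph mode guide.

```python
# Minimal sketch (not part of this change): pass ascend scheduler options via
# additional_config. The model name below is only a placeholder.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
            # Options from the vLLM SchedulerConfig can be added here as well.
            "enable_chunked_prefill": True,
        },
    },
)
print(llm.generate("Hello, how are you?"))
```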
### Example @@ -61,17 +63,18 @@ A full example of additional configuration is as follows: ``` { "torchair_graph_config": { - "enabled": true, - "use_cached_graph": true, + "enabled": True, + "use_cached_graph": True, "graph_batch_sizes": [1, 2, 4, 8], - "graph_batch_sizes_init": false, - "enable_multistream_shared_expert": false + "graph_batch_sizes_init": False, + "enable_multistream_moe": False, + "enable_kv_nz": False }, "ascend_scheduler_config": { - "enabled": true, - "chunked_prefill_enabled": true, + "enabled": True, + "enable_chunked_prefill": True, }, "expert_tensor_parallel_size": 1, - "refresh": false, + "refresh": False, } ``` diff --git a/docs/source/user_guide/graph_mode.md b/docs/source/user_guide/graph_mode.md index 2bd83ffe7..38d356249 100644 --- a/docs/source/user_guide/graph_mode.md +++ b/docs/source/user_guide/graph_mode.md @@ -23,7 +23,7 @@ import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" model = LLM(model="Qwen/Qwen2-7B-Instruct") outputs = model.generate("Hello, how are you?") @@ -45,16 +45,17 @@ offline example: import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" -model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enable": True}}) +# TorchAirGraph is only work without chunked-prefill now +model = LLM(model="deepseek-ai/DeepSeek-R1-0528", additional_config={"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}}) outputs = model.generate("Hello, how are you?") ``` online example: ```shell -vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enable": True}}' +vllm serve Qwen/Qwen2-7B-Instruct --additional-config='{"torchair_graph_config": {"enabled": True},"ascend_scheduler_config": {"enabled": True,}}' ``` You can find more detail about additional config [here](./additional_config.md) @@ -69,7 +70,7 @@ offline example: import os from vllm import LLM -os.environ["VLLM_USE_V1"] = 1 +os.environ["VLLM_USE_V1"] = "1" model = LLM(model="someother_model_weight", enforce_eager=True) outputs = model.generate("Hello, how are you?") diff --git a/docs/source/user_guide/release_notes.md b/docs/source/user_guide/release_notes.md index 42a944f44..8f72b937e 100644 --- a/docs/source/user_guide/release_notes.md +++ b/docs/source/user_guide/release_notes.md @@ -1,5 +1,13 @@ # Release note +## v0.9.0rc2 - 2025.06.10 + +This release contains some quick fixes for v0.9.0rc1. Please use this release instead of v0.9.0rc1. + +### Highlights + +- Fix the import error when vllm-ascend is installed without editable way. [#1152](https://github.com/vllm-project/vllm-ascend/pull/1152) + ## v0.9.0rc1 - 2025.06.09 This is the 1st release candidate of v0.9.0 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. From this release, V1 Engine is recommended to use. The code of V0 Engine is frozen and will not be maintained any more. Please set environment `VLLM_USE_V1=1` to enable V1 Engine. 
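To make the release note above concrete, here is a minimal sketch of enabling the V1 Engine before building a model, mirroring the graph mode examples in this change; the model name is only a placeholder.

```python
import os

from vllm import LLM

# Sketch only: enable the V1 Engine via the environment variable recommended
# above, then run a simple offline generation. The model name is a placeholder.
os.environ["VLLM_USE_V1"] = "1"

model = LLM(model="Qwen/Qwen2-7B-Instruct")
outputs = model.generate("Hello, how are you?")
print(outputs)
```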
diff --git a/examples/disaggregated_prefill/find_device_ips.py b/examples/disaggregated_prefill/find_device_ips.py index 205afbf78..48fd7b9d3 100644 --- a/examples/disaggregated_prefill/find_device_ips.py +++ b/examples/disaggregated_prefill/find_device_ips.py @@ -30,38 +30,40 @@ HCCN_TOOL_PATH = envs.HCCN_PATH -def get_device_ips(world_size: int): - npu_info = subprocess.run( - ["npu-smi", "info", "-m"], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - ) +def get_device_ips(): + npu_info = subprocess.run(['npu-smi', 'info', '-m'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) if npu_info.returncode != 0 or not os.path.exists(HCCN_TOOL_PATH): raise RuntimeError("No npu-smi/hccn_tool tools provided for NPU.") - npu_start_idx = int( - re.match(r".*\n\t([0-9]+).*", - npu_info.stdout).group(1)) # type: ignore + + # ‌Extract NPU IDs for all Ascend devices (excluding Mcu rows) + device_ids = [] + for line in npu_info.stdout.strip().split('\n'): + match = re.match(r'^\s*(\d+)\s+\d+\s+\d+\s+Ascend', line) + if match: + device_ids.append(int(match.group(1))) + + if not device_ids: + raise RuntimeError( + "Cannot parse any valid device ID from npu-smi output.") + device_ip_list = [] - for ip_offset in range(world_size): - cmd = [ - HCCN_TOOL_PATH, - "-i", - f"{npu_start_idx + ip_offset}", - "-ip", - "-g", - ] - device_ip_info = subprocess.run( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, - ) - device_ip = re.match(r"ipaddr:(.*)\n", - device_ip_info.stdout).group(1) # type: ignore + for device_id in device_ids: + cmd = [HCCN_TOOL_PATH, '-i', str(device_id), '-ip', '-g'] + device_ip_info = subprocess.run(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) + ip_match = re.search(r'ipaddr:(.*)', device_ip_info.stdout) + if not ip_match: + raise RuntimeError( + f"Cannot parse IP from hccn_tool for device {device_id}") + device_ip = ip_match.group(1).strip() device_ip_list.append(device_ip) + return device_ip_list -# Pass number of NPUs into this function. -print(get_device_ips(8)) +print(get_device_ips()) diff --git a/examples/offline_multi_step_custom_ops.py b/examples/offline_multi_step_custom_ops.py index 82a1bf575..59c7fafcc 100644 --- a/examples/offline_multi_step_custom_ops.py +++ b/examples/offline_multi_step_custom_ops.py @@ -19,9 +19,6 @@ from vllm import LLM, SamplingParams -import vllm_ascend.platform as pf - -pf.CUSTOM_OP_ENABLED = True # set True for custom Ops of Multi-Step. 
prompts = [ "Hello, my name is", "The president of the United States is", diff --git a/examples/run_dp_attention_etp16.sh b/examples/run_dp_attention_etp16.sh new file mode 100644 index 000000000..b73649241 --- /dev/null +++ b/examples/run_dp_attention_etp16.sh @@ -0,0 +1,23 @@ +export VLLM_ENABLE_MC2=0 +export VLLM_USE_V1=1 +export TASK_QUEUE_ENABLE=1 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +source /usr/local/Ascend/nnal/atb/set_env.sh +export ASCEND_LAUNCH_BLOCKING=0 +export VLLM_VERSION=0.9.0 + +nohup python -m vllm.entrypoints.openai.api_server --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM \ + --quantization ascend \ + --trust-remote-code \ + --distributed-executor-backend=mp \ + --port 8006 \ + -tp=8 \ + -dp=2 \ + --max-num-seqs 24 \ + --max-model-len 32768 \ + --max-num-batched-tokens 32768 \ + --block-size 128 \ + --no-enable-prefix-caching \ + --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true},"expert_tensor_parallel_size":16}' \ + --gpu-memory-utilization 0.96 &> run.log & +disown \ No newline at end of file diff --git a/examples/run_dp_attention_etp16_benmark.sh b/examples/run_dp_attention_etp16_benmark.sh new file mode 100644 index 000000000..ab72b3b28 --- /dev/null +++ b/examples/run_dp_attention_etp16_benmark.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# Concurrency array +concurrency_array=(48) +#best rate +rate_array=(0.7) + +# Result file +result_file="benchmark_results.txt" +echo "Benchmark Results" > $result_file +echo "===================" >> $result_file + +# Loop through all combinations +for concurrency in "${concurrency_array[@]}"; do + for rate in "${rate_array[@]}"; do + echo "Testing with concurrency=$concurrency, rate=$rate" + echo "" >> $result_file + echo "Concurrency: $concurrency, Request Rate: $rate" >> $result_file + echo "-------------------" >> $result_file + + # Run benchmark test + python /mnt/deepseek/vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --trust-remote-code \ + --model /mnt/deepseek/DeepSeek-R1-W8A8-VLLM \ + --dataset-name random \ + --random-input-len 4096 \ + --random-output-len 1536 \ + --ignore-eos \ + --num-prompts 400 \ + --max-concurrency $concurrency \ + --request-rate $rate \ + --metric-percentiles 90 \ + --base-url http://localhost:8006 2>&1 | tee -a $result_file + + # Wait for system cool down + sleep 30 + done +done + +# Analyze results +echo "Analysis Results" > analysis_results.txt +echo "=================" >> analysis_results.txt + +# Extract and analyze TPOT data +echo "TPOT Analysis:" >> analysis_results.txt +grep "Mean TPOT" $result_file | awk -F':' '{ + printf "Concurrency %s, Rate %s: %s ms\n", $1, $2, $3 +}' >> analysis_results.txt + +# Extract and analyze throughput data +echo -e "\nThroughput Analysis:" >> analysis_results.txt +grep "Output token throughput" $result_file | awk -F':' '{ + printf "Concurrency %s, Rate %s: %s tokens/s\n", $1, $2, $3 +}' >> analysis_results.txt + +echo "Testing completed. 
Results saved in $result_file and analysis in analysis_results.txt" diff --git a/mypy.ini b/mypy.ini index 72b03de21..6fe8e6c29 100644 --- a/mypy.ini +++ b/mypy.ini @@ -6,6 +6,9 @@ warn_unused_configs = True [mypy-torch_npu.*] ignore_missing_imports = True +[mypy-torchair.*] +ignore_missing_imports = True + [mypy-transformers.*] ignore_missing_imports = True diff --git a/tests/conftest.py b/tests/conftest.py index 16bbc8027..e0d70a19d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,6 +17,7 @@ # Adapted from vllm-project/vllm/blob/main/tests/conftest.py # +import contextlib import gc from typing import List, Optional, Tuple, TypeVar, Union @@ -53,11 +54,17 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray] -def cleanup_dist_env_and_memory(): +def cleanup_dist_env_and_memory(shutdown_ray: bool = False): destroy_model_parallel() destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + if shutdown_ray: + import ray # Lazy import Ray + ray.shutdown() gc.collect() torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() class VllmRunner: diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py index dc02c4b97..f5ec2c872 100644 --- a/tests/multicard/test_offline_inference_distributed.py +++ b/tests/multicard/test_offline_inference_distributed.py @@ -23,7 +23,7 @@ import os from unittest.mock import patch -import vllm # noqa: F401 +from modelscope import snapshot_download # type: ignore from vllm import SamplingParams from tests.conftest import VllmRunner @@ -95,3 +95,20 @@ def test_models_distributed_DeepSeek_dbo(): distributed_executor_backend="mp", ) as vllm_model: vllm_model.generate(example_prompts, sampling_params) + + +def test_models_distributed_DeepSeek_W8A8(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + + with VllmRunner( + snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"), + max_model_len=8192, + enforce_eager=True, + dtype="auto", + tensor_parallel_size=4, + quantization="ascend", + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/multicard/test_torchair_graph_mode.py b/tests/multicard/test_torchair_graph_mode.py new file mode 100644 index 000000000..d06ec7de2 --- /dev/null +++ b/tests/multicard/test_torchair_graph_mode.py @@ -0,0 +1,80 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +"""Compare the short outputs of HF and vLLM when using greedy sampling. + +Run `pytest tests/multicard/test_torchair_graph_mode.py`. 
+""" +import os + +import pytest + +from tests.conftest import VllmRunner + +os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" + + +@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", + reason="torchair graph is not supported on v0") +def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_MODELSCOPE", "True") + m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + + example_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + dtype = "half" + max_tokens = 5 + # torchair is only work without chunked-prefill now + with VllmRunner( + "vllm-ascend/DeepSeek-V3-Pruning", + dtype=dtype, + tensor_parallel_size=4, + distributed_executor_backend="mp", + additional_config={ + "torchair_graph_config": { + "enabled": True, + }, + "ascend_scheduler_config": { + "enabled": True, + }, + "refresh": True, + }, + enforce_eager=False, + ) as vllm_model: + # use greedy sampler to make sure the generated results are fix + vllm_output = vllm_model.generate_greedy(example_prompts, + max_tokens) + # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random weight of + # DeepSeek-V3 with 2 hidden layers, thus the golden results seems + # inaccurate. This will only change if accuracy improves with the + # official weights of DeepSeek-V3. + golden_results = [ + 'Hello, my name is feasibility伸 spazio debtor添', + 'The president of the United States is begg"""\n杭州风和 bestimm', + 'The capital of France is frequentlyশามalinkAllowed', + 'The future of AI is deleting俯احت怎么样了حراف', + ] + + assert len(golden_results) == len(vllm_output) + for i in range(len(vllm_output)): + assert golden_results[i] == vllm_output[i][1] + print(f"Generated text: {vllm_output[i][1]!r}") diff --git a/tests/ops/test_vocabparallelembedding.py b/tests/ops/test_vocabparallelembedding.py new file mode 100644 index 000000000..97d6c7059 --- /dev/null +++ b/tests/ops/test_vocabparallelembedding.py @@ -0,0 +1,91 @@ +from typing import Tuple + +import pytest +import torch +import torch_npu # noqa: F401 + +import vllm_ascend.platform # noqa: F401 + +# Test parameters +DTYPES = [torch.int32] +#SHAPES = [(100,), (5, 20), (3, 4, 5)] # Various tensor shapes +#SHAPES = [(3, 4, 8), (3, 4, 5)] # Various tensor shapes +SHAPES = [(3, 4, 3)] +DEVICES = [f"npu:{0}"] +SEEDS = [0] + + +def get_masked_input_and_mask_ref( + input_: torch.Tensor, org_vocab_start_index: int, + org_vocab_end_index: int, num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Reference implementation for verification""" + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < + org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index) + added_offset = added_vocab_start_index - ( + org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding + valid_offset = (org_vocab_start_index * + org_vocab_mask) + (added_offset * added_vocab_mask) + vocab_mask = org_vocab_mask | added_vocab_mask + masked_input = vocab_mask * (input_ - valid_offset) + return masked_input, ~vocab_mask + + +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEEDS) +@torch.inference_mode() +def test_get_masked_input_and_mask( + shape: Tuple[int, ...], + dtype: torch.dtype, + device: str, + seed: int, +) -> 
None: + # Set random seed + torch.manual_seed(seed) + torch.set_default_device(device) + + # Generate random input tensor + input_tensor = torch.randint(0, 1000, shape, dtype=dtype) + + # Test parameters + test_case = { + "org_start": 100, + "org_end": 200, + "padding": 0, + "added_start": 300, + "added_end": 400, + } + + # Get reference result + ref_masked_input, ref_mask = get_masked_input_and_mask_ref( + input_tensor, test_case["org_start"], test_case["org_end"], + test_case["padding"], test_case["added_start"], test_case["added_end"]) + + # Get custom op result + print("input_tensor:", input_tensor) + custom_masked_input, custom_mask = torch.ops._C.get_masked_input_and_mask( + input_tensor, test_case["org_start"], test_case["org_end"], + test_case["padding"], test_case["added_start"], test_case["added_end"]) + + ref_masked_input = ref_masked_input.to(dtype) + print("custom_masked_input:", custom_masked_input) + print("ref_masked_input:", ref_masked_input) + print("custom_mask:", custom_mask) + print("ref_mask:", ref_mask) + # Compare results + torch.testing.assert_close( + custom_masked_input, + ref_masked_input, + rtol=1e-5, + atol=1e-5, + msg=f"Masked input mismatch for case: {test_case}") + torch.testing.assert_close(custom_mask, + ref_mask, + rtol=1e-5, + atol=1e-5, + msg=f"Mask mismatch for case: {test_case}") diff --git a/tests/singlecard/compile/test_simple.py b/tests/singlecard/compile/test_simple.py index 64d4cba67..70b89297a 100644 --- a/tests/singlecard/compile/test_simple.py +++ b/tests/singlecard/compile/test_simple.py @@ -14,8 +14,6 @@ set_current_vllm_config) from vllm.utils import direct_register_custom_op -from vllm_ascend.utils import vllm_version_is - global_counter = 0 # create a library to hold the custom op @@ -93,28 +91,14 @@ def test_simple_piecewise_compile(): model = SillyModel(vllm_config=vllm_config, prefix="") inputs = torch.randn(100).npu() - - if vllm_version_is("0.9.0"): - kwargs = { - "num_graphs_seen": 1, # one graph for the model - "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 - "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers - "num_backend_compilations": - 3, # num_piecewise_capturable_graphs_seen - "num_cudagraph_caputured": - 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - } - else: - kwargs = { - "num_graphs_seen": 1, # one graph for the model - "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 - "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers - "num_backend_compilations": - 3, # num_piecewise_capturable_graphs_seen - "num_cudagraph_captured": - 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen - } - + kwargs = { + "num_graphs_seen": 1, # one graph for the model + "num_piecewise_graphs_seen": 5, # 2 * num_layers + 1 + "num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers + "num_backend_compilations": 3, # num_piecewise_capturable_graphs_seen + "num_cudagraph_captured": + 6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + } with compilation_counter.expect(kwargs): model(inputs) diff --git a/tests/singlecard/core/__init__.py b/tests/singlecard/core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/singlecard/core/test_ascend_scheduler.py b/tests/singlecard/core/test_ascend_scheduler.py new file mode 100644 index 000000000..c382ebdf4 --- /dev/null +++ b/tests/singlecard/core/test_ascend_scheduler.py @@ -0,0 +1,792 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing 
import Optional + +import pytest +import torch +from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, + SchedulerConfig, SpeculativeConfig, VllmConfig) +from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.sampling_params import SamplingParams +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec) +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.request import Request, RequestStatus +from vllm.v1.structured_output import StructuredOutputManager + +from vllm_ascend.core.scheduler import AscendScheduler +from vllm_ascend.utils import vllm_version_is + +EOS_TOKEN_ID = 50256 + + +def create_scheduler( + model: str = "Qwen/Qwen2.5-0.5B-Instruct", + max_num_seqs: int = 16, + max_num_batched_tokens: int = 8192, + enable_prefix_caching: Optional[bool] = None, + long_prefill_token_threshold: int = 0, + disable_chunked_mm_input: bool = False, + use_kv_connector: bool = False, + num_blocks: int = 10000, + block_size: int = 16, + max_model_len: Optional[int] = None, + num_speculative_tokens: Optional[int] = None, + enable_chunked_prefill: bool = False, +) -> AscendScheduler: + '''Create scheduler under test. + + Args: + model: model under test + max_num_seqs: max sequences to schedule + max_num_batch_tokens: max num tokens to batch + enable_prefix_caching: optionally force APC config + (True/False) or use default + (None) + + Returns: + {class}`Scheduler` instance + ''' + if max_model_len is None: + max_model_len = max_num_batched_tokens + scheduler_config = SchedulerConfig( + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + max_model_len=max_model_len, + long_prefill_token_threshold=long_prefill_token_threshold, + disable_chunked_mm_input=disable_chunked_mm_input, + enable_chunked_prefill=enable_chunked_prefill, + ) + model_config = ModelConfig( + model=model, + task="auto", + tokenizer=model, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="float16", + seed=42, + ) + # Cache config, optionally force APC + kwargs_cache = ({} if enable_prefix_caching is None else { + 'enable_prefix_caching': enable_prefix_caching + }) + cache_config = CacheConfig( + block_size=block_size, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + **kwargs_cache, + ) + kv_transfer_config = KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": "local_storage"}, + ) if use_kv_connector else None + + speculative_config: Optional[SpeculativeConfig] = None + if num_speculative_tokens is not None: + speculative_config = SpeculativeConfig( + model="ngram", num_speculative_tokens=num_speculative_tokens) + + vllm_config = VllmConfig( + scheduler_config=scheduler_config, + model_config=model_config, + cache_config=cache_config, + kv_transfer_config=kv_transfer_config, + speculative_config=speculative_config, + ) + kv_cache_config = KVCacheConfig( + num_blocks=num_blocks, # A large number of blocks to hold all requests + **({ + "tensors": {} + } if vllm_version_is("0.9.0") else { + "kv_cache_tensors": [] + }), + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(block_size, 1, 1, torch.float32, + False)) + ], + ) + cache_config.num_gpu_blocks = num_blocks + return AscendScheduler( + vllm_config=vllm_config, + kv_cache_config=kv_cache_config, + log_stats=True, + structured_output_manager=StructuredOutputManager(vllm_config), + ) + + +def 
create_requests(num_requests: int, + num_tokens: int = 10, + mm_positions: Optional[list[PlaceholderRange]] = None, + max_tokens: int = 16, + stop_token_ids: Optional[list[int]] = None, + prompt_logprobs: Optional[int] = None): + sampling_params = SamplingParams(ignore_eos=False, + max_tokens=max_tokens, + stop_token_ids=stop_token_ids, + prompt_logprobs=prompt_logprobs) + requests = [] + for i in range(num_requests): + if mm_positions is not None: + mm_position = mm_positions[i] + mm_inputs = [MultiModalKwargs({})] * len(mm_position) + else: + mm_position = None + mm_inputs = None + request = Request( + request_id=f"{i}", + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_inputs=mm_inputs, + multi_modal_placeholders=mm_position, + multi_modal_hashes=None, + eos_token_id=EOS_TOKEN_ID, + **({ + "arrival_time": 0.0 + } if vllm_version_is("0.9.0") else {}), + ) + requests.append(request) + return requests + + +def test_add_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + + for i, request in enumerate(requests): + scheduler.add_request(request) + assert request.request_id in scheduler.requests + assert len(scheduler.waiting) == i + 1 + + +def test_finish_request(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_ABORTED) + assert request.request_id not in scheduler.requests + assert len(scheduler.waiting) == 9 - i + + +def test_get_num_unfinished_requests(): + scheduler = create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_STOPPED) + assert scheduler.get_num_unfinished_requests() == len(requests) - i - 1 + + +@pytest.mark.parametrize("enable_prefix_caching, prompt_logprobs", [ + (None, None), + (True, 5), +]) +def test_schedule(enable_prefix_caching: Optional[bool], + prompt_logprobs: Optional[int]): + '''Test scheduling. + Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs + ''' + scheduler = create_scheduler(enable_prefix_caching=enable_prefix_caching) + requests = create_requests(num_requests=10, + prompt_logprobs=prompt_logprobs) + for request in requests: + scheduler.add_request(request) + + # Test initial scheduling + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + # Verify all requests are scheduled. + for req_id, num_tokens in output.num_scheduled_tokens.items(): + assert num_tokens == len(requests[int(req_id)].prompt_token_ids) + + # Verify requests moved from waiting to running + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == len(requests) + for i, request in enumerate(requests): + assert scheduler.running[i] == request + + +@pytest.mark.parametrize("enable_prefix_caching", [True, False]) +def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): + """Test scheduling behavior with concurrent partial requests. + + This test verifies that: there are multiple long prefill requests in the + RUNNING state, and we can schedule them together. 
+ + """ + scheduler = create_scheduler( + model="facebook/opt-125m", + max_num_batched_tokens=1024, + long_prefill_token_threshold=400, + enable_prefix_caching=enable_prefix_caching, + enable_chunked_prefill=True, + ) + requests = create_requests( + num_requests=3, + num_tokens=800, + ) + for request in requests: + scheduler.add_request(request) + + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 3 + assert len(output.scheduled_cached_reqs) == 0 + assert len(output.finished_req_ids) == 0 + + # The first request is scheduled partially - 400. + assert output.num_scheduled_tokens[requests[0].request_id] == 400 + # The second request is scheduled partially - 400. + assert output.num_scheduled_tokens[requests[1].request_id] == 400 + # The third request is also scheduled partially - 1024 - 400 - 400 = 224. + assert output.num_scheduled_tokens[requests[2].request_id] == 224 + req_to_index = { + request.request_id: i + for i, request in enumerate(requests) + } + model_runner_output = ModelRunnerOutput( + req_ids=[request.request_id for request in requests], + req_id_to_index=req_to_index, + sampled_token_ids=[[] for _ in range(len(requests))], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(output, model_runner_output) + + # Schedule the next step. All three requests are running. + # Processed the remaining prefills of the first and second requests. + output1 = scheduler.schedule() + assert len(scheduler.running) == 3 + assert len(output1.scheduled_new_reqs) == 0 + assert len(output1.scheduled_cached_reqs) == 3 + assert len(output1.finished_req_ids) == 0 + assert output1.num_scheduled_tokens[requests[0].request_id] == 400 + assert output1.num_scheduled_tokens[requests[1].request_id] == 400 + assert output1.num_scheduled_tokens[requests[2].request_id] == 224 + + # Schedule the third step. All three requests are running. + # First and second requests are in the decode stage. + # All the remaining tokens in the third request are processed. 
+ model_runner_output = ModelRunnerOutput( + req_ids=[request.request_id for request in requests], + req_id_to_index=req_to_index, + sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(output1, model_runner_output) + output2 = scheduler.schedule() + assert len(scheduler.running) == 3 + assert len(output2.scheduled_new_reqs) == 0 + assert len(output2.scheduled_cached_reqs) == 3 + assert len(output2.finished_req_ids) == 0 + assert output2.num_scheduled_tokens[requests[0].request_id] == 1 + assert output2.num_scheduled_tokens[requests[1].request_id] == 1 + assert output2.num_scheduled_tokens[ + requests[2].request_id] == 800 - 224 - 224 + + +def test_stop_via_update_from_output(): + """Test stopping behavior through update_from_output""" + scheduler = create_scheduler(num_speculative_tokens=1) + + # Test case 1: Stop on EOS token + requests = create_requests(num_requests=2, max_tokens=10) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 1, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [], + requests[1].request_id: [10] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[EOS_TOKEN_ID], + [10, + 11]], # First request hits EOS, second continues + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped, second continues + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_STOPPED + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID] + assert list(requests[1].output_token_ids) == [10, 11] + + # Test case 2: Stop on custom stop token + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=2, + max_tokens=10, + stop_token_ids=[42, 43]) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=5, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 42], + requests[1].request_id: [13] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + spec_token_ids=None, + logprobs=None, + 
prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped on custom token + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_STOPPED + assert requests[0].stop_reason == 42 + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [10, 42] + assert list(requests[1].output_token_ids) == [13, 14] + + # Test case 3: Stop on max tokens + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=2, max_tokens=2) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + + scheduler_output = SchedulerOutput(scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 1 + }, + total_num_scheduled_tokens=4, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 11], + requests[1].request_id: [] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={req.request_id: i + for i, req in enumerate(requests)}, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped due to length + assert len(scheduler.running) == 1 + assert scheduler.running[0].request_id == requests[1].request_id + assert requests[0].status == RequestStatus.FINISHED_LENGTH_CAPPED + assert requests[0].request_id in scheduler.finished_req_ids + assert list(requests[0].output_token_ids) == [10, 11 + ] # Truncated to max_tokens + assert list(requests[1].output_token_ids) == [13] + + # Test case 4: Ignore EOS flag + scheduler = create_scheduler(num_speculative_tokens=2) + requests = create_requests(num_requests=1, max_tokens=10) + requests[0].sampling_params.ignore_eos = True + requests[0].num_computed_tokens = requests[0].num_tokens + scheduler.requests[requests[0].request_id] = requests[0] + scheduler.running.append(requests[0]) + + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={requests[0].request_id: 3}, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [EOS_TOKEN_ID, 10] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify request continues past EOS + assert len(scheduler.running) == 1 + assert not requests[0].is_finished() + assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID, 10, 11] + + +@pytest.mark.parametrize("enable_prefix_caching, prompt_logprobs", [ + (None, None), + (True, 5), +]) +def test_schedule_concurrent_batches(enable_prefix_caching: 
Optional[bool], + prompt_logprobs: Optional[int]): + scheduler = create_scheduler( + max_num_batched_tokens=1024, + max_num_seqs=2, + enable_prefix_caching=enable_prefix_caching, + enable_chunked_prefill=True, + ) + requests = create_requests( + num_requests=2, + num_tokens=512, + prompt_logprobs=prompt_logprobs, + ) + + # Schedule the first request. + scheduler.add_request(requests[0]) + scheduler_output0 = scheduler.schedule() + assert len(scheduler_output0.scheduled_new_reqs) == 1 + assert scheduler_output0.num_scheduled_tokens[ + requests[0].request_id] == 512 + + # The first request is still running, so only schedule the second request. + scheduler.add_request(requests[1]) + scheduler_output1 = scheduler.schedule() + assert len(scheduler_output1.scheduled_new_reqs) == 1 + assert scheduler_output1.num_scheduled_tokens[ + requests[1].request_id] == 512 + + # Model output of the first request. + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(scheduler_output0, model_runner_output) + + # Schedule the next step. + # The first request can be scheduled again while the second + # request is still running. + scheduler_output2 = scheduler.schedule() + assert scheduler_output2.num_scheduled_tokens[requests[0].request_id] == 1 + + # Model output of the second request. + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + scheduler.update_from_output(scheduler_output1, model_runner_output) + + +# Note - these test cases mirror some of those in test_rejection_sampler.py +@pytest.mark.parametrize( + "spec_tokens,output_tokens,expected", + [ + ([[1, 2, 3]], [[1, 2, 3, 4]], (1, 3, 3, [1, 1, 1])), # perfect match + ([[1, 2, 3]], [[1, 5]], (1, 3, 1, [1, 0, 0])), # early mismatch + ([[1, 2], [3]], [[1, 2, 5], [3, 4]], + (2, 3, 3, [2, 1])), # multiple sequences + ([[1]], [[1, 2]], (1, 1, 1, [1])), # single token sequence + ([[]], [[5]], (0, 0, 0, [0])), # empty sequence + ([[1, 2, 3], [4, 5, 6]], [[1, 2, 7], [4, 8]], + (2, 6, 3, [2, 1, 0])), # multiple mismatches + ]) +def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): + """Test scheduling behavior with speculative decoding. + + This test verifies that: + 1. Speculated tokens get scheduled correctly + 2. 
Spec decoding stats properly count number of draft and accepted tokens + """ + if vllm_version_is("0.9.0"): + return + num_spec_tokens = max(1, max(len(t) for t in spec_tokens)) + scheduler = create_scheduler(num_speculative_tokens=num_spec_tokens) + requests = create_requests(num_requests=len(spec_tokens), num_tokens=1) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i + + # Schedule a decode, which will also draft speculative tokens + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == len(requests) + assert output.total_num_scheduled_tokens == len(requests) + for i in range(len(requests)): + req_id = requests[i].request_id + assert output.num_scheduled_tokens[req_id] == 1 + assert req_id not in output.scheduled_spec_decode_tokens + + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + spec_token_ids=spec_tokens, + logprobs=None, + prompt_logprobs_dict={}, + ) + engine_core_outputs = scheduler.update_from_output(output, + model_runner_output) + + for i in range(len(requests)): + running_req = scheduler.running[i] + # The prompt token + assert running_req.num_computed_tokens == 1 + # The prompt token and the sampled token + assert running_req.num_tokens == 2 + # The prompt token, the sampled token, and the speculated tokens + assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i]) + + # No draft or accepted tokens counted yet + assert not engine_core_outputs or ( + engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None) + + # Schedule the speculated tokens for validation + output = scheduler.schedule() + assert len(output.scheduled_new_reqs) == 0 + # The sampled token and speculated tokens + assert output.total_num_scheduled_tokens == \ + len(requests) + sum(len(ids) for ids in spec_tokens) + for i in range(len(requests)): + req_id = requests[i].request_id + assert output.num_scheduled_tokens[req_id] == 1 + len(spec_tokens[i]) + if spec_tokens[i]: + assert len(output.scheduled_spec_decode_tokens[req_id]) == \ + len(spec_tokens[i]) + else: + assert req_id not in output.scheduled_spec_decode_tokens + + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + engine_core_outputs = scheduler.update_from_output(output, + model_runner_output) + + scheduler_stats = engine_core_outputs[0].scheduler_stats \ + if engine_core_outputs else None + if expected[0] == 0: + assert scheduler_stats.spec_decoding_stats is None # type: ignore + else: + assert scheduler_stats.spec_decoding_stats is not None # type: ignore + stats = scheduler_stats.spec_decoding_stats # type: ignore + assert stats.num_drafts == expected[0] + assert stats.num_draft_tokens == expected[1] + assert stats.num_accepted_tokens == expected[2] + assert stats.num_accepted_tokens_per_pos == expected[3] + + +def _assert_right_scheduler_output( + output: SchedulerOutput, + num_requests: int, + expected_num_scheduled_tokens: int, +): + """Check if SchedulerOutput is correct after remote KV cache hit.""" + + # We should inject the kv_connector_metadata. + assert len(output.kv_connector_metadata.requests) == num_requests + + # Only num_tokens - matched_num_new_tokens should be scheduled. 
+ for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + assert num_scheduled_tokens == expected_num_scheduled_tokens + + +def _assert_right_kv_cache_manager( + scheduler: AscendScheduler, + req_ids: list[str], + num_tokens: int, + block_size: int, + num_requests: int, + num_total_blocks: int, +): + """Check whether KVCacheManager is correct after allocate.""" + + # Make sure the request stats are right. + EXPECTED_TOTAL_BLOCKS = num_tokens // block_size + for req_id in req_ids: + blocks = (scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks[req_id]) + hashes = scheduler.kv_cache_manager.req_to_block_hashes[req_id] + assert (scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + num_cached_block[req_id] == EXPECTED_TOTAL_BLOCKS) + assert len(blocks) == EXPECTED_TOTAL_BLOCKS + assert len(hashes) == EXPECTED_TOTAL_BLOCKS + + # Make sure we actually touched all the blocks. + BLOCKS_PER_REQ = num_tokens / block_size + assert (scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == + num_total_blocks - num_requests * BLOCKS_PER_REQ) + + +def _step_until_done( + scheduler: AscendScheduler, + output: SchedulerOutput, + model_runner_output: ModelRunnerOutput, +): + """Loop over schedule(), update_from_output() until finished.""" + + all_finished = False + _ = scheduler.update_from_output(output, model_runner_output) + while not all_finished: + # Schedule + a few iterations until stopping. + output = scheduler.schedule() + assert len(scheduler.running) + for _, num_scheduled_tokens in output.num_scheduled_tokens.items(): + # We should be in the decode phase now. + assert num_scheduled_tokens == 1 + assert len(output.kv_connector_metadata.requests) == 0 + ecos = scheduler.update_from_output(output, model_runner_output)[0] + all_done = True + for eco in ecos.outputs: + if eco.finish_reason is None: + all_done = False + all_finished = all_done + + +def make_output(scheduler: AscendScheduler): + return ModelRunnerOutput( + req_ids=[req.request_id for req in scheduler.running], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(scheduler.running) + }, + sampled_token_ids=[[1000]] * len(scheduler.running), + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + ) + + +def assert_scheduler_empty(scheduler: AscendScheduler): + """Confirm the scheduler is "empty" - i.e. no leaks.""" + # Scheduler Metadata. + assert len(scheduler.requests) == 0 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 0 + assert len(scheduler.finished_req_ids) == 0 + assert len(scheduler._cached_reqs_data) == 0 + + # EncoderCacheManager. + assert len(scheduler.encoder_cache_manager.freed) == 0 + assert len(scheduler.encoder_cache_manager.cached) == 0 + + # KVCache Manager. + if not vllm_version_is("0.9.0"): + assert len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks) == 0 + assert len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].num_cached_block) == 0 + assert len(scheduler.kv_cache_manager.req_to_block_hashes) == 0 + num_free_blocks = ( + scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) + assert num_free_blocks == ( + scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1) + + # NOTE(rob): just the ref count on blocks will be 0. The hash + # value, etc will remain since we lazily evict for prefix cache. 
+ for block in scheduler.kv_cache_manager.block_pool.blocks: + assert block.ref_cnt == 0 + + +def test_memory_leak(): + """Test that we do not have a memory leak.""" + + scheduler = create_scheduler(enable_prefix_caching=True) + + NUM_REQUESTS = 5 + NUM_TOKENS = 10 + MAX_TOKENS = 10 + requests = create_requests(num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + max_tokens=MAX_TOKENS) + + # Add each request. + for request in requests: + scheduler.add_request(request) + scheduler_output = scheduler.schedule() + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Iterate until done. + while True: + scheduler_output = scheduler.schedule() + if len(scheduler.running) == 0: + break + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Confirm no memory leak. + assert_scheduler_empty(scheduler) \ No newline at end of file diff --git a/tests/singlecard/core/test_ascend_scheduler_e2e.py b/tests/singlecard/core/test_ascend_scheduler_e2e.py new file mode 100644 index 000000000..668dafced --- /dev/null +++ b/tests/singlecard/core/test_ascend_scheduler_e2e.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + +import pytest +from vllm import LLM + +if os.getenv("VLLM_USE_V1", "0") != "1": + pytest.skip("Test package requires V1", allow_module_level=True) + +MODEL = "Qwen/Qwen2.5-0.5B-Instruct" +PROMPT = "Hello my name is Robert and I" + + +@pytest.fixture(scope="module") +def model() -> LLM: + return LLM( + MODEL, + enforce_eager=True, + enable_prefix_caching=True, + max_num_batched_tokens=200, + max_num_seqs=3, + additional_config={"ascend_scheduler_config": { + "enabled": True, + }}) + + +def test_concurrent_partial_prefill(model): + outputs = model.generate([PROMPT] * 3) + assert len(outputs) == 3 + for output in outputs: + assert len(output.outputs) == 1 + + +def test_prefix_cache_stats_is_recorded(model): + # 129 tokens will make sure the first 128 tokens are cached in a full block + input_tokens = {"prompt_token_ids": [101] * 129} + _ = model.generate([input_tokens]) + outputs = model.generate([input_tokens]) + assert outputs[0].num_cached_tokens == 128 \ No newline at end of file diff --git a/tests/singlecard/ops/test_rotary_embedding.py b/tests/singlecard/ops/test_rotary_embedding.py index 2d5ec18da..a3504a88b 100644 --- a/tests/singlecard/ops/test_rotary_embedding.py +++ b/tests/singlecard/ops/test_rotary_embedding.py @@ -10,7 +10,9 @@ import torch import torch.nn as nn -import vllm_ascend.platform # noqa: F401 +from vllm_ascend.utils import enable_custom_op + +enable_custom_op() # Only Neox style true scenario is supported for now IS_NEOX_STYLE = [True] diff --git a/tests/singlecard/test_ascend_config.py b/tests/singlecard/test_ascend_config.py index 484fe5f70..63484d4a0 100644 --- a/tests/singlecard/test_ascend_config.py +++ b/tests/singlecard/test_ascend_config.py @@ -58,7 +58,8 @@ def test_run_with_ascend_config(): "use_cached_graph": True, "graph_batch_sizes": [1, 2, 4, 8], "graph_batch_sizes_init": False, - "enable_multistream_shared_expert": True, + "enable_multistream_moe": True, + "enable_multistream_mla": True, }, "ascend_scheduler_config": { "enabled": True, @@ -79,7 +80,8 @@ def test_run_with_ascend_config(): 1, 2, 4, 8 ] assert not ascend_config.torchair_graph_config.graph_batch_sizes_init - assert ascend_config.torchair_graph_config.enable_multistream_shared_expert 
+ assert ascend_config.torchair_graph_config.enable_multistream_mla + assert ascend_config.torchair_graph_config.enable_multistream_moe assert ascend_config.ascend_scheduler_config.enabled assert ascend_config.ascend_scheduler_config.enable_chunked_prefill assert ascend_config.expert_tensor_parallel_size == 1 diff --git a/tests/singlecard/test_offline_inference.py b/tests/singlecard/test_offline_inference.py index 553d109f3..572649f9a 100644 --- a/tests/singlecard/test_offline_inference.py +++ b/tests/singlecard/test_offline_inference.py @@ -37,6 +37,10 @@ "Qwen/Qwen3-0.6B-Base", ] MULTIMODALITY_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"] + +QUANTIZATION_MODELS = [ + "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8", +] os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" QUANTIZATION_MODELS = [ diff --git a/tests/singlecard/test_scheduler.py b/tests/singlecard/test_scheduler.py index d1c606278..8021f0306 100644 --- a/tests/singlecard/test_scheduler.py +++ b/tests/singlecard/test_scheduler.py @@ -31,7 +31,6 @@ from vllm.v1.structured_output import StructuredOutputManager from vllm_ascend.core.scheduler import AscendScheduler -from vllm_ascend.utils import vllm_version_is EOS_TOKEN_ID = 50256 @@ -87,27 +86,15 @@ def create_scheduler( vllm_config = VllmConfig(scheduler_config=scheduler_config, model_config=model_config, cache_config=cache_config) - - if vllm_version_is("0.9.0"): - kv_cache_config = KVCacheConfig( - num_blocks=10000, # A large number of blocks to hold all requests - tensors={}, - kv_cache_groups=[ - KVCacheGroupSpec(['layer'], - FullAttentionSpec(16, 1, 1, torch.float32, - False)) - ], - ) - else: - kv_cache_config = KVCacheConfig( - num_blocks=10000, # A large number of blocks to hold all requests - kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])], - kv_cache_groups=[ - KVCacheGroupSpec(['layer'], - FullAttentionSpec(16, 1, 1, torch.float32, - False, None)) - ], - ) + kv_cache_config = KVCacheConfig( + num_blocks=10000, # A large number of blocks to hold all requests + kv_cache_tensors=[KVCacheTensor(size=1024, shared_by=[1])], + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(16, 1, 1, torch.float32, False, + None)) + ], + ) cache_config.num_gpu_blocks = 10000 return AscendScheduler( vllm_config, @@ -135,27 +122,15 @@ def create_requests(num_requests: int, else: mm_position = None mm_inputs = None - if vllm_version_is("0.9.0"): - request = Request( - request_id=f"{i}", - prompt_token_ids=[i] * num_tokens, - sampling_params=sampling_params, - multi_modal_inputs=mm_inputs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=None, - arrival_time=0, - eos_token_id=EOS_TOKEN_ID, - ) - else: - request = Request( - request_id=f"{i}", - prompt_token_ids=[i] * num_tokens, - sampling_params=sampling_params, - multi_modal_inputs=mm_inputs, - multi_modal_placeholders=mm_position, - multi_modal_hashes=None, - eos_token_id=EOS_TOKEN_ID, - ) + request = Request( + request_id=f"{i}", + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_inputs=mm_inputs, + multi_modal_placeholders=mm_position, + multi_modal_hashes=None, + eos_token_id=EOS_TOKEN_ID, + ) requests.append(request) return requests diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py index d42f145e4..defa7fd3d 100644 --- a/vllm_ascend/ascend_config.py +++ b/vllm_ascend/ascend_config.py @@ -56,10 +56,13 @@ def __init__(self, torchair_graph_config): "graph_batch_sizes", []) self.graph_batch_sizes_init = torchair_graph_config.get( 
"graph_batch_sizes_init", False) - self.enable_multistream_shared_expert = torchair_graph_config.get( - "enable_multistream_shared_expert", False) + self.enable_multistream_mla = torchair_graph_config.get( + "enable_multistream_mla", False) + self.enable_multistream_moe = torchair_graph_config.get( + "enable_multistream_moe", False) self.enable_view_optimize = torchair_graph_config.get( "enable_view_optimize", True) + self.enable_kv_nz = torchair_graph_config.get("enable_kv_nz", False) if not isinstance(self.graph_batch_sizes, list): raise TypeError("graph_batch_sizes must be list[int]") diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py index 8f130e424..9c112ed62 100644 --- a/vllm_ascend/attention/attention.py +++ b/vllm_ascend/attention/attention.py @@ -36,10 +36,12 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ops.cache import concat_and_cache_mla -from vllm_ascend.platform import CUSTOM_OP_ENABLED +from vllm_ascend.utils import enable_custom_op from vllm_ascend.worker.model_runner import ( ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata) +_ALLOWED_NUM_QUERIES_PER_KV = [32, 64, 128] + def generate_attn_mask(max_seq_len: int, dtype=torch.float16, mask_value=None): # Construct lower triangle matrix. @@ -460,7 +462,7 @@ def advance_step(self, for i in range(num_queries): self.seq_lens[i] += 1 self.max_decode_seq_len = max(self.seq_lens) - if CUSTOM_OP_ENABLED: + if enable_custom_op(): #advance a step on NPU for existing inputs for a multi-step runner if custom ops is enabled torch.ops._C.advance_step_flashattn_ascendc( num_seqs=num_seqs, @@ -1005,6 +1007,15 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + # TODO: support numHeads / numKvHeads < 16 in MLA kernel + if self.torchair_graph_enabled: + assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ + ("The allowed number of queries per kv when enabling both MLA and Graph mode" + " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," + " as it only has 16 attention heads. 
And if you're using DeepSeek-V3 or DeepSeek-R1," + " please make sure after the tensor parallel split, num_heads / num_kv_heads in " + "{32, 64, 128}.") + def exec_kv( self, hidden_states: torch.Tensor, diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 70fb44df8..43cb71c68 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -14,11 +14,13 @@ from vllm.utils import cdiv, round_down from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.attention.attention import _ALLOWED_NUM_QUERIES_PER_KV from vllm_ascend.attention.attention_v1 import AscendAttentionState from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig from vllm_ascend.multistream.context import get_multistream_comm_context from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla +from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -551,15 +553,29 @@ def __init__( self.o_proj = kwargs['o_proj'] self.kv_a_proj_with_mqa = kwargs.get('kv_a_proj_with_mqa', None) self.kv_a_layernorm = kwargs.get('kv_a_layernorm', None) + self.num_queries_per_kv = self.num_heads // self.num_kv_heads ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz + self.enable_multistream_mla = \ + ascend_config.torchair_graph_config.enable_multistream_mla + # Adapt torch air graph mode with spec decoding. speculative_config = get_current_vllm_config().speculative_config if speculative_config is not None: self.spec_token_num = speculative_config.num_speculative_tokens assert self.spec_token_num > 0 + # TODO: support numHeads / numKvHeads < 16 in MLA kernel + if self.torchair_graph_enabled: + assert self.num_queries_per_kv in _ALLOWED_NUM_QUERIES_PER_KV, \ + ("The allowed number of queries per kv when enabling both MLA and Graph mode" + " only support {32, 64, 128}, Thus this is not supported for DeepSeek-V2-Lite," + " as it only has 16 attention heads. 
And if you're using DeepSeek-V3 or DeepSeek-R1," + " please make sure after the tensor parallel split, num_heads / num_kv_heads in " + "{32, 64, 128}.") + def _v_up_proj_and_o_proj(self, x): # Convert from (B, N, L) to (N, B, L) x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) @@ -848,7 +864,40 @@ def exec_kv( kv = self.kv_a_proj_with_mqa(hidden_states)[0] # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) - k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( + cache_mode = "PA_NZ" if self.enable_kv_nz else "PA" + with npu_stream_switch("mla_secondary", + 0, + enabled=self.enable_multistream_mla): + k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( + kv, + self.kv_a_layernorm.weight, + cos, + sin, + slots.to(torch.int64), + kv_cache[1], + kv_cache[0], + epsilon=self.kv_a_layernorm.variance_epsilon, + cache_mode=cache_mode, + ) + return k_pe, k_nope + + def exec_kv_prefill( + self, + hidden_states: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + kv_cache: Tuple, + slots: torch.Tensor, + ): + + B = hidden_states.shape[0] + N = self.num_kv_heads + S = 1 + kv = self.kv_a_proj_with_mqa(hidden_states)[0] + # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] + kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) + cache_mode = "PA_BLK_NZ" if self.enable_kv_nz else "PA" + _, _, k_pe, k_nope = torch_npu.npu_kv_rmsnorm_rope_cache( kv, self.kv_a_layernorm.weight, cos, @@ -857,7 +906,8 @@ def exec_kv( kv_cache[1], kv_cache[0], epsilon=self.kv_a_layernorm.variance_epsilon, - cache_mode="PA", + cache_mode=cache_mode, + is_output_kv=True, ) return k_pe, k_nope @@ -895,34 +945,42 @@ def _forward_decode( # TorchAir's shape is [bs, num_heads_per_rank, q_seq_len, dim] if attn_metadata.attn_state == AscendAttentionState.SpecDecoding: assert num_tokens % self.spec_token_num == 0 - q_nope = (q_nope.view( - num_tokens // (self.spec_token_num + 1), - self.spec_token_num + 1, - self.num_heads, - -1, - ).transpose(1, 2).contiguous()) - q_pe = (q_pe.view( - num_tokens // (self.spec_token_num + 1), - self.spec_token_num + 1, - self.num_heads, - -1, - ).transpose(1, 2).contiguous()) + q_nope = q_nope.view(num_tokens // (self.spec_token_num + 1), + self.spec_token_num + 1, self.num_heads, + -1) + q_pe = q_pe.view(num_tokens // (self.spec_token_num + 1), + self.spec_token_num + 1, self.num_heads, -1) + if not self.enable_kv_nz: + q_nope = q_nope.transpose(1, 2).contiguous() + q_pe = q_pe.transpose(1, 2).contiguous() sparse_mode = 3 spec_attn_mask = attn_metadata.decode.attn_mask # type:ignore else: - q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) - q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) + if self.enable_kv_nz: + q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1) + q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1) + else: + q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) + q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) sparse_mode = 0 spec_attn_mask = None # shape of knope/k_pe for npu graph mode should be: # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim] block_size = kv_c_and_k_pe_cache[0].shape[1] - k_nope = k_nope.view(-1, self.num_kv_heads, block_size, - self.kv_lora_rank) - k_pe = k_pe.view(-1, self.num_kv_heads, block_size, - self.qk_rope_head_dim) + if self.enable_kv_nz: + k_nope = k_nope.view(-1, self.num_kv_heads, + self.kv_lora_rank // 16, block_size, 16) + k_pe = k_pe.view(-1, self.num_kv_heads, + self.qk_rope_head_dim // 16, 
block_size, 16) + input_layout = "BSND" + else: + k_nope = k_nope.view(-1, self.num_kv_heads, block_size, + self.kv_lora_rank) + k_pe = k_pe.view(-1, self.num_kv_heads, block_size, + self.qk_rope_head_dim) + input_layout = "BNSD" - attn_output, _ = torch.ops.npu.npu_fused_infer_attention_score( + attn_output, _ = torch_npu.npu_fused_infer_attention_score( q_nope, k_nope, k_nope, @@ -930,7 +988,7 @@ def _forward_decode( key_rope=k_pe, num_heads=self.num_heads, num_key_value_heads=self.num_kv_heads, - input_layout="BNSD", + input_layout=input_layout, atten_mask=spec_attn_mask, sparse_mode=sparse_mode, scale=self.scale, @@ -979,10 +1037,11 @@ def forward( ] num_actual_toks = attn_metadata.num_actual_tokens if k_pe is None and not self.running_in_graph: - kv_c, k_pe = self.kv_a_proj_with_mqa( - hidden_states_or_kv_c_normed)[0].split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) + if not self.torchair_graph_enabled: + kv_c, k_pe = self.kv_a_proj_with_mqa( + hidden_states_or_kv_c_normed)[0].split( + [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) else: kv_c_normed = hidden_states_or_kv_c_normed assert attn_metadata.num_decodes is not None and \ @@ -995,38 +1054,55 @@ def forward( # Inputs and outputs may be padded for CUDA graphs output_padded = output output = output[:num_actual_toks, ...] - kv_c_normed = kv_c_normed[:num_actual_toks, ...] - prefill_k_c_normed = kv_c_normed[num_decode_tokens:] + if not self.torchair_graph_enabled: + kv_c_normed = kv_c_normed[:num_actual_toks, ...] + prefill_k_c_normed = kv_c_normed[num_decode_tokens:] if not self.running_in_graph: hidden_states_or_q_c = hidden_states_or_q_c[:num_actual_toks, ...] - decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] prefill_hs_or_q_c = hidden_states_or_q_c[num_decode_tokens:] - k_pe = k_pe[:num_actual_toks, ...] - k_pe = k_pe.unsqueeze(1) - decode_k_pe = k_pe[:num_decode_tokens] - prefill_k_pe = k_pe[num_decode_tokens:] + if not self.torchair_graph_enabled: + decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] + k_pe = k_pe[:num_actual_toks, ...] + k_pe = k_pe.unsqueeze(1) + decode_k_pe = k_pe[:num_decode_tokens] + prefill_k_pe = k_pe[num_decode_tokens:] else: decode_hs_or_q_c = hidden_states_or_q_c if has_decode: decode_k_nope = None assert attn_metadata.decode is not None - decode_ql_nope, decode_q_pe = \ - self._q_proj_and_k_up_proj(decode_hs_or_q_c) if self.running_in_graph: seq_len = self.rotary_emb.max_position_embeddings cos = self.rotary_emb.cos_cached[:seq_len].to( - dtype=decode_q_pe.dtype) + dtype=decode_hs_or_q_c.dtype) sin = self.rotary_emb.sin_cached[:seq_len].to( - dtype=decode_q_pe.dtype) + dtype=decode_hs_or_q_c.dtype) cos = cos[attn_metadata.decode.input_positions] sin = sin[attn_metadata.decode.input_positions] cos = cos[:, None, None, :] sin = sin[:, None, None, :] - - decode_q_pe = self.rope_single(decode_q_pe, cos, sin) + # Without explicitly controlling the order, IndexByTensor operations + # would be placed after `matmul W_KV_T` hindering the overlapping of + # KvRmsNormRopeCache and SingleRope. 
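The wait/stream-switch calls below are what keep the two NPU streams genuinely overlapped. A rough sketch of that pattern, assuming only what the call sites in this hunk show (`npu_stream_switch(tag, priority, enabled=...)` acting as a context manager for a secondary stream, and `npu_wait_tensor(a, b, enabled=...)` making the consumers of `a` wait until `b` is produced); the function and tensor names here are placeholders, not the module's real attributes.

    from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor

    def rope_on_secondary_stream(q_pe, k_pe_from_cache, cos, sin, rope_fn,
                                 enabled: bool = True):
        # Mirror the wait-then-compute pattern used in this hunk: the RoPE of
        # q_pe runs on the secondary stream, but only after the KV-cache
        # kernel has produced k_pe_from_cache, so the main stream can keep
        # working in parallel instead of being serialized behind it.
        with npu_stream_switch("mla_secondary", 0, enabled=enabled):
            npu_wait_tensor(q_pe, k_pe_from_cache, enabled=enabled)
            return rope_fn(q_pe, cos, sin)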
+ npu_wait_tensor(decode_hs_or_q_c, + cos, + enabled=self.enable_multistream_mla) + npu_wait_tensor(decode_hs_or_q_c, + sin, + enabled=self.enable_multistream_mla) + decode_ql_nope, decode_q_pe = \ + self._q_proj_and_k_up_proj(decode_hs_or_q_c) + if self.running_in_graph: decode_k_pe, decode_k_nope = self.exec_kv( hidden_states_or_kv_c_normed, cos, sin, kv_cache, attn_metadata.slot_mapping) + with npu_stream_switch("mla_secondary", + 0, + enabled=self.enable_multistream_mla): + npu_wait_tensor(decode_q_pe, + decode_k_pe, + enabled=self.enable_multistream_mla) + decode_q_pe = self.rope_single(decode_q_pe, cos, sin) else: decode_q_pe[...], decode_k_pe[...] = self.rotary_emb( attn_metadata.decode.input_positions, @@ -1041,22 +1117,25 @@ def forward( prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim] if self.torchair_graph_enabled: num_tokens = prefill_hs_or_q_c.shape[0] + seq_len = self.rotary_emb.max_position_embeddings + cos = self.rotary_emb.cos_cached[:seq_len].to( + dtype=prefill_q_pe.dtype) + sin = self.rotary_emb.sin_cached[:seq_len].to( + dtype=prefill_q_pe.dtype) + cos = cos[attn_metadata.prefill.input_positions] + sin = sin[attn_metadata.prefill.input_positions] + cos = cos[:, None, None, :] + sin = sin[:, None, None, :] + + prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin) + prefill_k_pe, prefill_k_nope = self.exec_kv_prefill( + hidden_states_or_kv_c_normed, cos, sin, kv_cache, + attn_metadata.slot_mapping) + + kv_c_normed = prefill_k_nope[:num_actual_toks, ...] + prefill_k_c_normed = prefill_k_nope[num_decode_tokens:] prefill_k_pe = prefill_k_pe.view(num_tokens, self.num_kv_heads, -1) - if self.rotary_emb.__class__.__name__ == 'RotaryEmbedding': - # NOTE: When scaling not specified - ori_q_pe_shape, ori_k_pe_shape = prefill_q_pe.shape, prefill_k_pe.shape - prefill_q_pe = prefill_q_pe.reshape(num_tokens, -1) - prefill_k_pe = prefill_k_pe.reshape(num_tokens, -1) - prefill_q_pe, prefill_k_pe = self.rotary_emb( - attn_metadata.prefill.input_positions, prefill_q_pe, - prefill_k_pe) - prefill_q_pe = prefill_q_pe.view(ori_q_pe_shape) - prefill_k_pe = prefill_k_pe.view(ori_k_pe_shape) - else: - prefill_q_pe, prefill_k_pe = self.rotary_emb( - attn_metadata.prefill.input_positions, prefill_q_pe, - prefill_k_pe) prefill_q = torch.cat([prefill_q_nope, prefill_q_pe], dim=-1) else: prefill_q_pe[...], prefill_k_pe[...] = self.rotary_emb( diff --git a/vllm_ascend/compilation/piecewise_backend.py b/vllm_ascend/compilation/piecewise_backend.py index 95ce69393..c6a800b3d 100644 --- a/vllm_ascend/compilation/piecewise_backend.py +++ b/vllm_ascend/compilation/piecewise_backend.py @@ -31,8 +31,6 @@ from vllm.logger import logger from vllm.utils import weak_ref_tensors -from vllm_ascend.utils import vllm_version_is - @dataclasses.dataclass class ConcreteSizeEntry: @@ -206,11 +204,7 @@ def __call__(self, *args) -> Any: # to save memory entry.output = weak_ref_tensors(output) entry.aclgraph = aclgraph - - if vllm_version_is("0.9.0"): - compilation_counter.num_cudagraph_caputured += 1 - else: - compilation_counter.num_cudagraph_captured += 1 + compilation_counter.num_cudagraph_captured += 1 # important: we need to return the output, rather than # the weak ref of the output, so that pytorch can correctly diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py index 42f5d9c69..2fa31c264 100644 --- a/vllm_ascend/core/scheduler.py +++ b/vllm_ascend/core/scheduler.py @@ -14,23 +14,24 @@ # limitations under the License. # This file is a part of the vllm-ascend project. 
# +import time from collections import deque from typing import Iterable, Union from vllm.config import VllmConfig +from vllm.distributed.kv_events import KVEventBatch from vllm.logger import logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.utils import cdiv +from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler -from vllm.v1.engine import EngineCoreOutputs +from vllm.v1.engine import EngineCoreEventType, EngineCoreOutputs from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.structured_output import StructuredOutputManager -from vllm_ascend.utils import vllm_version_is - class AscendScheduler(Scheduler): """This Scheduler extends vllm's original v1 scheduler @@ -51,11 +52,6 @@ def __init__( self.scheduled_req_ids: set[str] = set() self.running: list[Request] = [] - if self.vllm_config.kv_transfer_config is not None and \ - self.vllm_config.kv_transfer_config.is_kv_consumer: - raise ValueError( - "AscendScheduler cannot be used for decode nodes. ") - def schedule(self) -> SchedulerOutput: if self.scheduler_config.chunked_prefill_enabled: return super().schedule() @@ -70,6 +66,9 @@ def schedule(self) -> SchedulerOutput: # Spec decode-related. scheduled_spec_decode_tokens: dict[str, list[int]] = {} + # For logging. + scheduled_timestamp = time.monotonic() + # Record scheduled LoRA requests. scheduled_loras: set[int] = set() @@ -88,6 +87,18 @@ def skip_cur_request(): self.waiting.popleft() skipped_waiting_requests.appendleft(request) + num_prealloc_computed_tokens = 0 + # P/D: skip request if still waiting for remote kvs. + if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + is_ready = self._update_waiting_for_remote_kv(request) + if is_ready: + request.status = RequestStatus.WAITING + num_prealloc_computed_tokens = ( + request.num_computed_tokens) + else: + skip_cur_request() + continue + # Check that adding the request still respects the max_loras # constraint. if (self.lora_config and request.lora_request and @@ -97,42 +108,70 @@ def skip_cur_request(): skip_cur_request() continue - prompt_limit = self._get_prompt_limit(request) - # Get already-cached tokens. - computed_blocks, num_computed_tokens = ( - self.kv_cache_manager.get_computed_blocks(request)) - num_new_tokens = request.num_tokens - num_computed_tokens - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) - max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * - self.block_size) - prompt_limit = min(prompt_limit, max_tokens_in_kvcache) - - # Finish request that exceeds prompt_limit or kv cache size. - if num_new_tokens > prompt_limit: - logger.warning( - "Input prompt (%d tokens) is too long" - " and exceeds limit of %d", - num_new_tokens, - prompt_limit, - ) - request.status = RequestStatus.FINISHED_IGNORED - self.finished_req_ids.add(request.request_id) # type: ignore - self.waiting.popleft() - continue - - if num_new_tokens > token_budget: - # Scheduling would exceed token_budget, skip. - skip_cur_request() - continue - - assert num_new_tokens > 0 + num_external_computed_tokens = 0 + load_kv_async = False - if vllm_version_is("0.9.0"): - blocks = computed_blocks.blocks + # Get already-cached tokens. 
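The branch that follows splits "already computed" tokens into two sources: hits from the local prefix cache and hits reported by the KV connector. A small, self-contained sketch of that accounting (hypothetical helper and numbers, not the scheduler's actual API):

    def new_tokens_to_schedule(num_prompt_tokens: int,
                               num_native_computed_tokens: int,
                               num_external_computed_tokens: int) -> int:
        # Locally cached (prefix-cache) and externally cached (connector)
        # tokens both count as already computed; only the remainder of the
        # prompt is scheduled as new work.
        num_computed_tokens = (num_native_computed_tokens +
                               num_external_computed_tokens)
        return num_prompt_tokens - num_computed_tokens

    assert new_tokens_to_schedule(512, 256, 128) == 128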
+ if num_prealloc_computed_tokens == 0: + new_computed_blocks, num_native_computed_tokens = \ + self.kv_cache_manager.get_computed_blocks( + request) + + # Get externally-cached tokens if using a KVConnector. + if self.connector is not None: + num_external_computed_tokens, load_kv_async = ( + self.connector.get_num_new_matched_tokens( + request, num_native_computed_tokens)) + + # Total computed tokens (local + external). + num_computed_tokens = (num_native_computed_tokens + + num_external_computed_tokens) + else: + # P/D: skip checking prefix cache if loaded from remote kvs. + new_computed_blocks = KVCacheBlocks.create_empty() + num_native_computed_tokens = 0 + + # Total computed tokens (allocated in prior step). + num_computed_tokens = num_prealloc_computed_tokens + + # P/D: loading remote KV, do not allocate for new work. + if load_kv_async: + assert num_external_computed_tokens > 0 + num_new_tokens = 0 + blocks = None + # Number of tokens to be scheduled. else: + prompt_limit = self._get_prompt_limit(request) + # Get already-cached tokens. + computed_blocks, num_computed_tokens = ( + self.kv_cache_manager.get_computed_blocks(request)) + # We use `request.num_tokens` instead of + # `request.num_prompt_tokens` to consider the resumed + # requests, which have output tokens. + num_new_tokens = request.num_tokens - num_computed_tokens + max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * + self.block_size) + prompt_limit = min(prompt_limit, max_tokens_in_kvcache) + + # Finish request that exceeds prompt_limit or kv cache size. + if num_new_tokens > prompt_limit: + logger.warning( + "Input prompt (%d tokens) is too long" + " and exceeds limit of %d", + num_new_tokens, + prompt_limit, + ) + request.status = RequestStatus.FINISHED_IGNORED + self.finished_req_ids.add( # type: ignore + request.request_id) # type: ignore + self.waiting.popleft() + continue + + if num_new_tokens > token_budget: + # Scheduling would exceed token_budget, skip. + skip_cur_request() + continue + assert num_new_tokens > 0 blocks = computed_blocks.blocks[0] watermark = getattr(self.scheduler_config, "watermark", 0.01) @@ -143,13 +182,38 @@ def skip_cur_request(): continue new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens, new_computed_blocks=computed_blocks) + request, + num_new_tokens + num_external_computed_tokens, + num_native_computed_tokens, + new_computed_blocks=computed_blocks, + num_lookahead_tokens=self.num_lookahead_tokens, + delay_cache_blocks=load_kv_async) if new_blocks is None: # The request cannot be scheduled. break + # KVConnector: update internal state after allocation. + # This information is used to determine if a load is + # needed for this request. + if num_external_computed_tokens: + assert self.connector is not None + self.connector.update_state_after_alloc( + request, + new_computed_blocks + new_blocks, + num_external_computed_tokens, + ) + self.waiting.popleft() + if load_kv_async: + # If loading async, allocate memory and put request + # into the WAITING_FOR_REMOTE_KV state. + skipped_waiting_requests.appendleft(request) + request.status = RequestStatus.WAITING_FOR_REMOTE_KVS + continue self.running.append(request) + if self.log_stats: + request.record_event(EngineCoreEventType.SCHEDULED, + scheduled_timestamp) self.scheduled_req_ids.add(request.request_id) # Check request status. 
if request.status == RequestStatus.WAITING: @@ -168,6 +232,9 @@ def skip_cur_request(): token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens + # Count the number of prefix cached tokens. + if request.num_cached_tokens < 0: + request.num_cached_tokens = num_computed_tokens # Put back any skipped requests at the head of the waiting queue if skipped_waiting_requests: @@ -186,16 +253,45 @@ def skip_cur_request(): num_new_tokens = (request.num_tokens_with_spec - request.num_computed_tokens) - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) + assert (request.num_tokens - request.num_computed_tokens) == 1 num_new_tokens = min(num_new_tokens, token_budget) - assert num_new_tokens == 1 + # Make sure the input position does not exceed the max model len. + # This is necessary when using spec decoding. + num_new_tokens = min( + num_new_tokens, + self.max_model_len - request.num_computed_tokens) + # Check that adding the request still respects the max_loras + # constraint. + if self.lora_config and request.lora_request and ( + len(scheduled_loras) == self.lora_config.max_loras + and request.lora_request.lora_int_id + not in scheduled_loras): + # Scheduling would exceed max_loras, skip. + num_new_tokens = 0 + + if num_new_tokens == 0: + # The request cannot be scheduled because one of the following + # reasons: + # 1. No new tokens to schedule. This may happen when PP>1 and + # we have already scheduled all prompt tokens but they are + # not finished yet. + # 2. Adding the request exceeds the max_loras constraint. + # NOTE(woosuk): Here, by doing `continue` instead of `break`, + # we do not strictly follow the FCFS scheduling policy and + # allow the lower-priority requests to be scheduled. + req_index += 1 + continue + + num_draft_tokens = max( + num_new_tokens + request.num_computed_tokens - + request.num_tokens, 0) while True: new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens) + request, + num_new_tokens, + num_draft_tokens=num_draft_tokens, + num_lookahead_tokens=self.num_lookahead_tokens) if new_blocks is None: # The request cannot be scheduled. # Preempt the lowest-priority request. @@ -203,6 +299,10 @@ def skip_cur_request(): self.kv_cache_manager.free(preempted_req) preempted_req.status = RequestStatus.PREEMPTED preempted_req.num_computed_tokens = 0 + if self.log_stats: + preempted_req.record_event( + EngineCoreEventType.PREEMPTED, + scheduled_timestamp) self.waiting.appendleft(preempted_req) preempted_reqs.append(preempted_req) if preempted_req == request: @@ -237,6 +337,10 @@ def skip_cur_request(): scheduled_spec_decode_tokens[request.request_id] = ( request.spec_token_ids) + + # Record scheduled LoRA requests. + if self.lora_config and request.lora_request: + scheduled_loras.add(request.lora_request.lora_int_id) + # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens @@ -304,6 +408,11 @@ def skip_cur_request(): meta = self.connector.build_connector_meta(scheduler_output) scheduler_output.kv_connector_metadata = meta + events = self.kv_cache_manager.take_events() + if events: + batch = KVEventBatch(ts=time.time(), events=events) + self.kv_event_publisher.publish(batch) + # Advance the number of computed tokens for the request AFTER # the request is scheduled. # 1. 
The scheduler_output of the current step has to include the @@ -330,14 +439,8 @@ def _check_watermark_for_prefill(self, len(computed_blocks) * self.block_size) num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, self.block_size) - - if vllm_version_is("0.9.0"): - req_blocks = self.kv_cache_manager.single_type_manager.req_to_blocks[ - request.request_id] - else: - req_blocks = self.kv_cache_manager.coordinator.get_blocks( - request.request_id) - + req_blocks = self.kv_cache_manager.coordinator.get_blocks( + request.request_id) num_new_blocks = (num_required_blocks - len(req_blocks) - len(computed_blocks)) num_evictable_computed_blocks = sum(1 for blk in computed_blocks @@ -401,7 +504,8 @@ def update_from_output( if num_tokens_scheduled == 0: # The request was not scheduled in this step. continue - self.scheduled_req_ids.remove(req_id) + if req_id in self.scheduled_req_ids: + self.scheduled_req_ids.remove(req_id) return super().update_from_output(scheduler_output, model_runner_output) diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py index c3de6ae93..9db49cbff 100644 --- a/vllm_ascend/models/deepseek_dbo.py +++ b/vllm_ascend/models/deepseek_dbo.py @@ -29,7 +29,7 @@ import torch import torch.distributed as dist -import torch_npu +import torch_npu # noqa: F401 import vllm.envs as envs from torch import nn from transformers import PretrainedConfig @@ -40,13 +40,10 @@ get_tp_group, tensor_model_parallel_all_reduce) from vllm.distributed.parallel_state import get_dp_group from vllm.forward_context import get_forward_context -from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, ReplicatedLinear, - RowParallelLinear, - UnquantizedLinearMethod) + RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope @@ -67,6 +64,7 @@ import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2MLP from vllm_ascend.multistream.base import MSEventKey from vllm_ascend.multistream.context import ( advance_step_multistream_layer_context, get_multistream_comm_context, @@ -78,117 +76,17 @@ make_multistream_metadata_ds) from vllm_ascend.multistream.ms_split import compute_split_seq_index from vllm_ascend.ops.fused_moe import AscendFusedMoE -from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod from vllm_ascend.utils import dispose_tensor VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 -class CustomDeepseekDBOMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - quant_config: Optional[QuantizationConfig] = None, - reduce_results: bool = True, - prefix: str = "", - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj") - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: 
{hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - # NOTE: `torch_npu.npu_dequant_swiglu_quant` can only be enabled in dynamic quant - self.is_dynamic_quant = not isinstance( - self.gate_up_proj.quant_method, - UnquantizedLinearMethod) and isinstance( - self.gate_up_proj.quant_method.quant_method, - AscendW8A8DynamicLinearMethod) - - def forward(self, x): - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - x = tensor_model_parallel_all_reduce(x) - return x - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x +class CustomDeepseekDBOMLP(CustomDeepseekV2MLP): def _forward_ms_mlp(self, x): current_ms_metadata = get_multistream_comm_context() assert current_ms_metadata is not None - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - current_ms_metadata.before_comm_event.record() - with torch.npu.stream(current_ms_metadata.comm_stream): - current_ms_metadata.before_comm_event.wait() - x = tensor_model_parallel_all_reduce(x) - current_ms_metadata.after_comm_event.record() - return x gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) current_ms_metadata.before_comm_event.record() diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py index 96c76338b..0ae1142a3 100644 --- a/vllm_ascend/models/deepseek_v2.py +++ b/vllm_ascend/models/deepseek_v2.py @@ -25,7 +25,7 @@ # # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py # """Inference-only DeepseekV2/DeepseekV3 model.""" -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -69,12 +69,74 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import AscendFusedMoE +from vllm_ascend.quantization.quant_config import AscendLinearMethod from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod -from vllm_ascend.utils import dispose_tensor +from vllm_ascend.utils import (dispose_tensor, npu_stream_switch, + npu_wait_tensor) VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 +class 
CustomDeepseekV2SiluAndMul(SiluAndMul): + + def __init__(self, + *, + weight_scale: Optional[Callable[[], torch.Tensor]] = None): + super().__init__() + self.weight_scale = weight_scale + + def forward_oot(self, x: Union[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]): + if isinstance(x, tuple): + assert self.weight_scale is not None + # For AscendW8A8DynamicLinearMethod: + # a dynamic scale is passed along with the quantized value. + quantized_x, dynamic_scale = x + return torch_npu.npu_dequant_swiglu_quant( + x=quantized_x, + weight_scale=self.weight_scale(), + activation_scale=dynamic_scale, + activate_left=True, + quant_mode=1) + else: + return super().forward_oot(x) + + +class CustomDeepseekV2MergedReplicatedLinear(ReplicatedLinear): + + def __init__( + self, + input_size: int, + output_sizes: list[int], + bias: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + self.output_sizes = output_sizes + super().__init__(input_size, + sum(output_sizes), + bias=bias, + quant_config=quant_config, + prefix=prefix) + + def weight_loader(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, loaded_shard_id: int): + # With no support for GGUF format yet. + assert not getattr(param, "is_gguf_weight", False) + assert not getattr(param, "is_gguf_weight_type", False) + + assert loaded_shard_id < len(self.output_sizes) + shard_offset = sum(self.output_sizes[:loaded_shard_id]) + shard_size = self.output_sizes[loaded_shard_id] + shard = param.data.narrow(param.output_dim, shard_offset, shard_size) + + assert shard.size() == loaded_weight.size(), ( + f"Tried to load weights of size {loaded_weight.size()}" + f"to a parameter shard of id {loaded_shard_id} size {shard.size()}" + ) + shard.copy_(loaded_weight) + + class CustomDeepseekV2MLP(nn.Module): def __init__( @@ -84,61 +146,68 @@ def __init__( hidden_act: str, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True, + force_replicate: bool = False, prefix: str = "", ) -> None: super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj") + if not force_replicate: + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") + else: + self.gate_up_proj = CustomDeepseekV2MergedReplicatedLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = ReplicatedLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. 
" "Only silu is supported for now.") - self.act_fn = SiluAndMul() - # NOTE: `torch_npu.npu_dequant_swiglu_quant` can only be enabled in dynamic quant - self.is_dynamic_quant = not isinstance( - self.gate_up_proj.quant_method, - UnquantizedLinearMethod) and isinstance( - self.gate_up_proj.quant_method.quant_method, - AscendW8A8DynamicLinearMethod) + quant_method = self.gate_up_proj.quant_method + if isinstance(quant_method, UnquantizedLinearMethod): + self.act_fn = CustomDeepseekV2SiluAndMul() + elif (isinstance(quant_method, AscendLinearMethod) and isinstance( + quant_method.quant_method, AscendW8A8DynamicLinearMethod)): + # TODO(sdmyzlp): Currently preserved as before: + # 1. The only quantization supported for silu is W8A8Dynamic + # 2. Output dtype of gate_up/down is fixed to be int32/bfloat16 + # + # Maybe one can implement a better and more general configuration + # scheme, e.g. by somehow passing around the tweaked `quant_config` + self.act_fn = CustomDeepseekV2SiluAndMul( + # Use lazy binding, for `weight_scale_fp32` is accessible + # only after `process_weights_after_loading`. + weight_scale=lambda: self.gate_up_proj.weight_scale_fp32) + # To be consumed by AscendW8A8DynamicLinearMethod.apply() + self.gate_up_proj._ascend_quant_config = { + "output_dtype": torch.int32, + "pertoken_scale": False, + "return_scale": True, + } + self.down_proj._ascend_quant_config = { + "output_dtype": torch.bfloat16, + "pertoken_scale": True, + "return_scale": False, + } + else: + raise NotImplementedError( + f"Quantization with [{type(quant_method)}] is NOT supported") def forward(self, x): - if self.is_dynamic_quant: - x, dynamic_scale = torch_npu.npu_dynamic_quant(x) - x = torch_npu.npu_quant_matmul( - x, - self.gate_up_proj.weight, - self.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - x, dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=x, - weight_scale=self.gate_up_proj.weight_scale_fp32, - activation_scale=dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - x = torch_npu.npu_quant_matmul( - x, - self.down_proj.weight, - self.down_proj.weight_scale, - pertoken_scale=dynamic_scale, - output_dtype=torch.bfloat16, - ) - if self.down_proj.reduce_results and self.down_proj.tp_size > 1: - x = tensor_model_parallel_all_reduce(x) - return x gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) x, _ = self.down_proj(x) @@ -169,6 +238,12 @@ def __init__( raise ValueError(f"Unsupported activation: {config.hidden_act}. 
" "Only silu is supported for now.") + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on + self.enable_multistream_moe = \ + ascend_config.torchair_graph_config.enable_multistream_moe and VLLM_ENABLE_MC2 + self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, bias=False, @@ -204,8 +279,11 @@ def __init__( hidden_act=config.hidden_act, quant_config=quant_config, reduce_results=True, + force_replicate=self.enable_multistream_moe, prefix=f"{prefix}.shared_experts", ) + else: + self.shared_experts = None # type: ignore CustomDeepseekV2MoE.top_k = config.num_experts_per_tok self.dp_size = get_dp_group().world_size @@ -216,12 +294,6 @@ def __init__( self.params_dtype = torch.get_default_dtype() - ascend_config = get_ascend_config() - self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on - self.enable_multistream_shared_expert = \ - ascend_config.torchair_graph_config.enable_multistream_shared_expert and VLLM_ENABLE_MC2 - def forward( self, hidden_states: torch.Tensor, @@ -240,12 +312,10 @@ def forward( enable_force_load_balance = False if hasattr(attn_metadata, 'with_prefill_across_dp'): is_prefill = is_prefill or attn_metadata.with_prefill_across_dp - num_tokens, hidden_size = hidden_states.shape - - multistream = self.enable_multistream_shared_expert and not is_prefill - - old_hidden_states = hidden_states.clone() + old_hidden_states = hidden_states + use_separated_shared_experts = (self.shared_experts is not None + and not self.enable_multistream_moe) if self.tp_size > 1: if (VLLM_ENABLE_MC2 @@ -262,25 +332,22 @@ def forward( # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - kwargs = {} - if multistream: - kwargs.update({ - "shared_experts": self.shared_experts, - "shared_hidden_states": old_hidden_states - }) - - hidden_states = self.experts( + experts_hidden_states = self.experts( hidden_states=hidden_states, router_logits=router_logits, is_prefill=is_prefill, top_k=CustomDeepseekV2MoE.top_k, enable_force_load_balance=enable_force_load_balance, - **kwargs) - - if multistream: - hidden_states, shared_output = hidden_states + shared_experts=(self.shared_experts + if not use_separated_shared_experts else None), + ) - hidden_states = hidden_states * self.routed_scaling_factor + if not isinstance(experts_hidden_states, tuple): + hidden_states = experts_hidden_states * self.routed_scaling_factor + else: + hidden_states = ( + experts_hidden_states[0] * self.routed_scaling_factor + + experts_hidden_states[1]) if self.tp_size > 1: if (VLLM_ENABLE_MC2 @@ -294,12 +361,9 @@ def forward( else: hidden_states = tensor_model_parallel_all_reduce(hidden_states) - if self.n_shared_experts is not None: - if not multistream: - shared_output = self.shared_experts(old_hidden_states) - - if shared_output is not None: - hidden_states = hidden_states + shared_output + if use_separated_shared_experts: + hidden_states = hidden_states + self.shared_experts( + old_hidden_states) return hidden_states.view(num_tokens, hidden_size) @@ -433,6 +497,8 @@ def __init__( ascend_config = get_ascend_config() self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_multistream_mla = \ + ascend_config.torchair_graph_config.enable_multistream_mla def forward( self, @@ -442,7 +508,14 @@ def forward( attn_metadata: 
Optional[AttentionMetadata] = None) -> torch.Tensor: if self.q_lora_rank is not None: ckq = self.q_a_proj(hidden_states)[0] - hidden_states_or_q_c = self.q_a_layernorm(ckq) + use_multistream_mla = (self.enable_multistream_mla + and attn_metadata is not None + and attn_metadata.num_decodes > 0) + npu_wait_tensor(hidden_states, ckq, enabled=use_multistream_mla) + with npu_stream_switch("mla_secondary", + 0, + enabled=use_multistream_mla): + hidden_states_or_q_c = self.q_a_layernorm(ckq) else: hidden_states_or_q_c = hidden_states if self.torchair_graph_enabled: diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index 25f3b05d5..d6115d35c 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -16,7 +16,7 @@ # Adapted from vllm/tests/kernels/test_moe.py import os -from typing import Callable, List, Optional +from typing import Any, Callable, List, Optional, Tuple, Union import torch import torch.distributed as dist @@ -36,6 +36,7 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer +from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 USING_LCCL_COM: bool = envs_ascend.USING_LCCL_COM @@ -106,15 +107,17 @@ def process_topk_ids(topk_ids: torch.Tensor, expert_num: int, ep_size: int, return topk_ids_pad, unpad_indices -def fused_experts_with_mc2(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - expert_map: torch.Tensor = None, - moe_all_to_all_group_name: Optional[str] = None, - **kwargs) -> torch.Tensor: +def fused_experts_with_mc2( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + moe_all_to_all_group_name: Optional[str] = None, + shared_experts: Optional[Any] = None +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: global_bs = 0 moe_expert_num = len(expert_map) kwargs_mc2 = { @@ -154,6 +157,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[ 0:5] + if shared_experts is not None: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(hidden_states, topk_weights) + shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states) + npu_wait_tensor(shared_gate_up, expand_x) + shared_act = shared_experts.act_fn(shared_gate_up) + w1 = w1.transpose(1, 2) group_list = expert_token_nums.to(torch.int64) @@ -210,7 +220,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2) - return hidden_states + if shared_experts is None: + return hidden_states + else: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(shared_act, down_out_list) + shared_hidden_states, _ = shared_experts.down_proj(shared_act) + return hidden_states, shared_hidden_states def apply_mlp(hidden_states_wrapper: List[torch.Tensor], @@ -875,6 +891,7 @@ def apply( e_score_correction_bias: Optional[torch.Tensor] = None, is_prefill: bool = False, enable_force_load_balance: bool = False, + shared_experts: Optional[Any] = None, **kwargs, ) -> torch.Tensor: @@ -924,7 +941,7 @@ def apply( top_k=top_k, expert_map=expert_map, moe_all_to_all_group_name=self.moe_all_to_all_group_name, - 
**kwargs) + shared_experts=shared_experts) elif self.torchair_graph_enabled or get_ep_group().world_size == 1: return fused_experts(hidden_states=x, w1=layer.w13_weight, @@ -1053,9 +1070,6 @@ def __init__( self.moe_parallel_config.ep_rank = get_ep_group().rank_in_group self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled - # NOTE: multistream only effective when `VLLM_ENABLE_MC2` is on - self.enable_multistream_shared_expert = \ - ascend_config.torchair_graph_config.enable_multistream_shared_expert and VLLM_ENABLE_MC2 if self.scoring_func != "softmax" and not self.use_grouped_topk: raise ValueError("Only softmax scoring function is supported for " @@ -1102,8 +1116,8 @@ def forward(self, router_logits: torch.Tensor, is_prefill: bool, enable_force_load_balance: bool = False, - top_k=None, - **kwargs): + top_k: Optional[int] = None, + shared_experts: Optional[Any] = None): assert self.quant_method is not None if top_k: @@ -1132,7 +1146,7 @@ def forward(self, hidden_states, router_logits) # Matrix multiply. - hidden_states = self.quant_method.apply( + e_hidden_states = self.quant_method.apply( layer=self, x=hidden_states, router_logits=router_logits, @@ -1150,36 +1164,39 @@ def forward(self, enable_force_load_balance=enable_force_load_balance, log2phy=self.log2phy, global_redundant_expert_num=self.global_redundant_expert_num, - **kwargs) + shared_experts=shared_experts, + ) - if self.enable_multistream_shared_expert and not is_prefill: - hidden_states, shared_output = hidden_states + if shared_experts is not None: + # Provide dummy implementation of "non-separated" shared experts. + if not isinstance(e_hidden_states, tuple): + return e_hidden_states, shared_experts(hidden_states) + else: + return e_hidden_states if self.dp_size > 1: if VLLM_ENABLE_MC2 and not is_prefill: ... 
elif self.torchair_graph_enabled: if USING_LCCL_COM: # type: ignore - hidden_states = dist._functional_collectives.reduce_scatter_tensor( - hidden_states, + e_hidden_states = dist._functional_collectives.reduce_scatter_tensor( + e_hidden_states, "sum", scatter_dim=0, group=get_dp_group().device_group) elif self.torchair_graph_enabled and not is_prefill: - hidden_states = dist._functional_collectives.reduce_scatter_tensor( - hidden_states, + e_hidden_states = dist._functional_collectives.reduce_scatter_tensor( + e_hidden_states, "sum", scatter_dim=0, group=get_dp_group().device_group) else: - hidden_states = get_ep_group().combine(hidden_states) + e_hidden_states = get_ep_group().combine(e_hidden_states) if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): - hidden_states = tensor_model_parallel_all_reduce(hidden_states) + e_hidden_states = tensor_model_parallel_all_reduce(e_hidden_states) - if self.enable_multistream_shared_expert and not is_prefill: - return hidden_states, shared_output - return hidden_states + return e_hidden_states # ----------------------------------------- TBO-related -------------------------------------------- diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 0c2a00afb..9f8ae784c 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -22,11 +22,12 @@ from vllm.model_executor.layers.rotary_embedding import ( DeepseekScalingRotaryEmbedding, RotaryEmbedding) -from vllm_ascend.platform import CUSTOM_OP_ENABLED +from vllm_ascend.utils import enable_custom_op def custom_rotary_embedding_enabled(query, neox_style, head_size): - return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and CUSTOM_OP_ENABLED + return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and enable_custom_op( + ) def rope_forward_oot( diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py index ccf9bd9e0..3c24bfc70 100644 --- a/vllm_ascend/patch/__init__.py +++ b/vllm_ascend/patch/__init__.py @@ -24,9 +24,9 @@ # each worker's `__init__` function. # # Then in each kind of patch, there are three folders: -# - patch_0_9_0: contains the patches applied when vllm version is 0.9.0. +# - patch_0_9_1: contains the patches applied when vllm version is 0.9.1. # - patch_main: contains the patches applied when vllm version is main branch. -# - patch_common: contains the patches applied in both 0.9.0 and main branch. +# - patch_common: contains the patches applied in both 0.9.1 and main branch. # # Once a new patch is added in vllm-ascend, please add the patch description into this file as well. # ---------------------------------------------------------------------------------- @@ -44,56 +44,40 @@ # platform owned `CoordinatorGroup` to make sure all the CoordinateGroup can be properly destroyed # How: # Call `vllm_ascend.distributed.parallel_state method `destroy_platform_model_parallel` to destroy all the `CoordinateGroup` -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# Related PR (if no, explain why): # Future Plan: # Remove those patch when vllm merged them -# 2. `vllm.distributed.stateless_init_torch_distributed_process_group()` +# 2. 
`vllm.v1.engine.core.DPEngineCoreProc._init_data_parallel` # Why: -# The stateless process group can not be initialized except from gloo and nccl backend, vllm-ascend -# needs to initialize its own stateless process group for communication, so we add the platform related -# call to the `stateless_init_torch_distributed_process_group`, to enable other platform which may support -# stateless process group initialize method +# There is a bug in the ASCEND_RT_VISIBLE_DEVICES handling. # How: -# rewrite stateless_init_torch_distributed_process_group to judge if there is a stateless process group initialize -# method and call platform method `platform_register_backend` to initialize them -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# The ASCEND_RT_VISIBLE_DEVICES related code is dropped. +# Related PR (if no, explain why): +# No, this is a bug in vllm-ascend. # Future Plan: -# Remove those patch when vllm merged them -# 3. `ParallelConfig.get_next_dp_init_port` +# Remove this patch once the ASCEND_RT_VISIBLE_DEVICES bug is fixed. +# 3. `vllm.config.ParallelConfig.get_next_dp_init_port` # Why: -# We want to get dp port from env variable, so the multi-node inference can be properly initialized and run. +# vllm doesn't support getting the dp port from an environment variable. # How: -# Get the dp port from env variable enable multi-mode dp inference -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# Add logic to get the port from an environment variable. +# Related PR (if no, explain why): +# A PR to vllm is needed to support getting the port from an environment variable. # Future Plan: -# Its a workaround in vllm-ascend to enable multi-node dp inference, maybe removed if vllm have better plan -# on multi-node dp inference implementation -# 4. `ParallelConfig.stateless_init_dp_group` +# Remove this patch once vllm merges it. +# 4. `vllm.config.ParallelConfig.stateless_init_dp_group` # Why: # vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to # get better performance # How: -# adopt nccl backend to init process group -# Related PR (if no, explain why): no related PR, we want add this ability into vllm +# Adopt the hccl backend to init the process group. (Now we still use gloo as a placeholder; we'll switch to hccl in the future.) +# Related PR (if no, explain why): +# A PR to vllm is needed to support more backends. # Future Plan: -# Remove those patch when vllm merged them -# +# Remove this patch when vllm supports more backends. # # * Worker Patch: # =============== -# ** File: worker/patch_common/patch_metrics.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# 1. `vllm.spec_decode.metrics.AsyncMetricsCollector.maybe_collect_rejsample_metrics` -# Why: -# There are cuda hard code (current_platform.is_cuda_alike()) in -# `AsyncMetricsCollector.maybe_collect_rejsample_metrics` -# How: -# Change to use `current_platform.Event` to determine whether to return None -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... -# https://github.com/vllm-project/vllm/pull/14411 -# Future Plan: -# Revert it when the related pr is merged in vllm. -# # ** File: worker/patch_common/patch_minicpm.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1.
`vllm.model_executor.models.minicpm.MiniCPMAttention.forward` @@ -103,7 +87,7 @@ # However float32 is not supported in cann rope op, thus we keep this patch # How: # Removed the dtype convert operations in forward -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # NO, only for npu due to rope op. # Future Plan: # Keep this patch in vllm-ascend. @@ -119,7 +103,7 @@ # - support attention metadata register to the set supported spec decode # - offer a api in platform to determine whether spec decode is supported, # and deprecate is_cuda_alike in it. -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: @@ -131,14 +115,14 @@ # vLLM `Remove Sampler from Model Code` so vllm-ascend needs adapt to this change. # How: # Use vLLM 0.8.4 method to patch it. -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: # Remove it when we identify the reasons clearly. # # ** File: worker/patch_common/patch_spec_decode_worker.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.spec_decode.spec_decode_worker.SpecDecodeWorker.create_worker` # Why: # We need to use the patched `TP1DraftModelRunner` in `SpecDecodeWorker.create_worker`. @@ -146,14 +130,14 @@ # `FlashAttentionMetadata` # How: # ditto -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm/pull/15195 # - https://github.com/vllm-project/vllm-ascend/pull/395 # Future Plan: # Revert it when the related pr is merged in vllm and vllm-ascend. # # ** File: worker/patch_common/patch_eagle.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.spec_decode.eagle.prepare_inputs` # Why: # We need to use the patched `prepare_input_kernel` in `eagle.prepare_inputs`. @@ -161,12 +145,12 @@ # kernel, ascend is now not support triton kernel. # How: # Re-implementation the `prepare_input_kernel` triton kernel by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - Ascend doesn't support triton # Future Plan: # Revert it when the ascend support triton kernel. # -# ** File: v1/sample/sampler.py ** +# ** File: worker/patch_common/patch_sampler.py ** # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1. `vllm.v1.sample.sampler.Sampler.apply_top_k_top_p` # Why: @@ -175,21 +159,44 @@ # to improve performance. # How: # Re-implementation the `apply_top_k_top_p` function by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm-ascend/pull/970 # Future Plan: # Revert it when the ascend scatter performance improves. # -# ** File: v1/sample/sampler.py ** -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~s -# 1. 
`vllm.v1.sample.sampler.Sampler.apply_min_p` +# 2. `vllm.v1.sample.sampler.Sampler.apply_min_p` # Why: # We need to use the patched `apply_min_p` in `sample`. # The mainly reason to overwrite `apply_min_p` is # to improve performance. # How: # Re-implementation the `apply_min_p` function by pytorch -# Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit.... +# Related PR (if no, explain why): # - https://github.com/vllm-project/vllm-ascend/pull/970 # Future Plan: # Revert it when the ascend indexput performance improves. +# +# ** File: worker/patch_common/patch_distributed.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.distributed.parallel_state.GroupCoordinator` +# Why: +# vllm doesn't support all_to_all for GroupCoordinator. +# How: +# Add all_to_all implementation for GroupCoordinator. +# Related PR (if no, explain why): +# A PR to vllm is needed to support all_to_all for GroupCoordinator. +# Future Plan: +# Remove this patch when vllm merges it. +# +# ** File: worker/patch_common/patch_utils.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.utils.direct_register_custom_op` +# Why: +# pytorch 2.7.0 is not compatible with pytorch 2.5.1. vllm is based on pytorch 2.7.0, while vllm-ascend +# is based on pytorch 2.5.1, so we need this patch to make vllm compatible with pytorch 2.5.1. +# How: +# patch the __annotations__ check to make it compatible with pytorch 2.5.1. +# Related PR (if no, explain why): +# This is a problem in vllm-ascend. +# Future Plan: +# Remove this patch once pytorch 2.7.0 is supported in vllm-ascend. diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py index e724fe531..4ec38e38d 100644 --- a/vllm_ascend/patch/platform/__init__.py +++ b/vllm_ascend/patch/platform/__init__.py @@ -17,8 +17,8 @@ from vllm_ascend.utils import vllm_version_is # Import specific patches for different versions -if vllm_version_is("0.9.0"): - from vllm_ascend.patch.platform import patch_0_9_0 # noqa: F401 +if vllm_version_is("0.9.1"): + from vllm_ascend.patch.platform import patch_0_9_1 # noqa: F401 from vllm_ascend.patch.platform import patch_common # noqa: F401 else: from vllm_ascend.patch.platform import patch_common # noqa: F401 diff --git a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py b/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py deleted file mode 100644 index d468326bd..000000000 --- a/vllm_ascend/patch/platform/patch_0_9_0/patch_distributed.py +++ /dev/null @@ -1,116 +0,0 @@ -import torch -from torch.distributed import ProcessGroup -from torch.distributed.distributed_c10d import (Backend, PrefixStore, - _get_default_timeout, - is_nccl_available) -from torch.distributed.rendezvous import rendezvous -from vllm.distributed import utils - - -def stateless_init_torch_distributed_process_group( - host: str, port: int, rank: int, world_size: int, - backend: str) -> ProcessGroup: - """ - A replacement for `torch.distributed.init_process_group` that does not - pollute the global state. The created ProcessGroup object can be used for - some operations such as `allreduce`, because it does not depend on the - global rank. However, some operations such as `broadcast` cannot be used - because it depends on the global rank. - - # TODO: ask for help from PyTorch team if we need the `broadcast` operation. - - This function is useful when we are not sure about the total number of - processes in the process group.
For example, we may have process - 1, 2, ..., 8 who want to communicate, and process 9 might be the same - process as process 1, or it might be a different process; process 10 - might be the same process as process 5, or it might be a different process. - In this case, how can we reliably form a communication channel within - process 9 and 10, without affecting the communication channel within - process 1, 2, ..., 8? - - One possible solution is to figure out if process 9 and 10 are the same - as process 1 and 5 beforehand, and then form a communication channel - based on the information, adjusting the ranks and world_size etc. However, - figuring out the information is not always easy, and it will interfere - with the main communication channel. - - Our solution is to always form a communication channel with process 1, 2, - ..., 8, and then use this function to form another communication channel - with process 9 and 10. This way, regardless of whether process 9 and 10 - are the same as process 1 and 5, the main communication channel is - always formed with process 1, 2, ..., 8, and the additional communication - channel is formed with process 9 and 10. - """ - init_method = f"tcp://{host}:{port}" - backend = Backend(backend) # it is basically string - timeout = _get_default_timeout(backend) - - store, rank, world_size = next( - rendezvous(init_method, rank, world_size, timeout=timeout)) - store.set_timeout(timeout) - - group_rank = rank - group_size = world_size - - # Use a PrefixStore to avoid accidental overrides of keys used by - # different systems (e.g. RPC) in case the store is multi-tenant. - prefix_store = PrefixStore(init_method, store) - - # TODO(Yizhou): The reason we need to set options while vllm does not - # seems to be related to the version of PyTorch. In the latest version, - # there is no need to set options. While in the older version, 2.5.1 - # specifically, we need to set options. - options = ProcessGroup.Options(backend=backend) - pg: ProcessGroup = ProcessGroup( - prefix_store, - group_rank, - group_size, - options, - ) - if backend == "gloo": - from torch.distributed.distributed_c10d import ProcessGroupGloo - backend_class = ProcessGroupGloo(prefix_store, - group_rank, - group_size, - timeout=timeout) - backend_type = ProcessGroup.BackendType.GLOO - device = torch.device("cpu") - elif backend == "nccl": - assert is_nccl_available() - from torch.distributed.distributed_c10d import ProcessGroupNCCL - - backend_options = ProcessGroupNCCL.Options() - backend_options._timeout = timeout - - backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, - backend_options) - backend_type = ProcessGroup.BackendType.NCCL - device = torch.device("cuda") - elif backend == "hccl": - from torch.distributed import is_hccl_available - assert is_hccl_available() - from torch_npu._C._distributed_c10d import ProcessGroupHCCL - backend_options = ProcessGroupHCCL.Options() - backend_options._timeout = timeout - backend_class = ProcessGroupHCCL(prefix_store, group_rank, group_size, - backend_options) - device = torch.device("npu") - backend_class._set_sequence_number_for_group() - backend_type = ProcessGroup.BackendType.CUSTOM - pg._register_backend(device, backend_type, backend_class) - return pg - else: - raise RuntimeError(f"Unsupported torch distributed backend: {backend}") - - # TODO(Yizhou): Like we mentioned above, _set_default_backend is not - # implemented in the 2.5.1 version of PyTorch. But we need to set it - # after the latest version is released. 
- # pg._set_default_backend(backend_type) - backend_class._set_sequence_number_for_group() - - pg._register_backend(device, backend_type, backend_class) - - return pg - - -utils.stateless_init_torch_distributed_process_group = stateless_init_torch_distributed_process_group diff --git a/vllm_ascend/patch/worker/patch_0_9_0/__init__.py b/vllm_ascend/patch/platform/patch_0_9_1/__init__.py similarity index 100% rename from vllm_ascend/patch/worker/patch_0_9_0/__init__.py rename to vllm_ascend/patch/platform/patch_0_9_1/__init__.py diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py index d1d3d42f4..3b29856d2 100644 --- a/vllm_ascend/patch/worker/__init__.py +++ b/vllm_ascend/patch/worker/__init__.py @@ -18,8 +18,8 @@ from vllm_ascend.utils import vllm_version_is # Import specific patches for different versions -if vllm_version_is("0.9.0"): - from vllm_ascend.patch.worker import patch_0_9_0 # noqa: F401 +if vllm_version_is("0.9.1"): + from vllm_ascend.patch.worker import patch_0_9_1 # noqa: F401 from vllm_ascend.patch.worker import patch_common # noqa: F401 else: from vllm_ascend.patch.worker import patch_common # noqa: F401 diff --git a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py similarity index 90% rename from vllm_ascend/patch/platform/patch_0_9_0/__init__.py rename to vllm_ascend/patch/worker/patch_0_9_1/__init__.py index f0ac16236..116c73c06 100644 --- a/vllm_ascend/patch/platform/patch_0_9_0/__init__.py +++ b/vllm_ascend/patch/worker/patch_0_9_1/__init__.py @@ -14,4 +14,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import vllm_ascend.patch.platform.patch_0_9_0.patch_distributed # noqa diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index 912e375f7..80211ad13 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -16,7 +16,6 @@ # import gc -import logging import os from datetime import timedelta from typing import TYPE_CHECKING, Optional, Tuple @@ -32,16 +31,6 @@ from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes -CUSTOM_OP_ENABLED = False -try: - # register custom ops into torch_library here - import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401 - CUSTOM_OP_ENABLED = True -except ImportError as e: - logging.warning( - "Failed to import 'vllm_ascend.vllm_ascend_C': %s. All custom ops will be disabled. ", - e) - if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig from vllm.utils import FlexibleArgumentParser @@ -50,7 +39,6 @@ VllmConfig = None FlexibleArgumentParser = None -os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1" os.environ["ACL_OP_INIT_MODE"] = ascend_envs.VLLM_ASCEND_ACL_OP_INIT_MODE diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py index c6e863ff7..66a0a302c 100644 --- a/vllm_ascend/quantization/w8a8_dynamic.py +++ b/vllm_ascend/quantization/w8a8_dynamic.py @@ -15,19 +15,19 @@ # limitations under the License. 
# -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, Optional, Tuple, Union import torch import torch.distributed as dist import torch_npu -import torchair as tng # type: ignore -from vllm.distributed import GroupCoordinator, tensor_model_parallel_all_reduce +from vllm.distributed import GroupCoordinator import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.distributed.parallel_state import get_ep_group from vllm_ascend.ops.fused_moe import select_experts -from vllm_ascend.utils import dispose_tensor +from vllm_ascend.utils import (dispose_tensor, npu_stream_switch, + npu_wait_tensor) VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2 @@ -39,8 +39,7 @@ def apply_mlp(hidden_states: torch.Tensor, w2_scale: torch.Tensor, group_list: torch.Tensor, dynamic_scale: torch.Tensor = None, - group_list_type: int = 1, - **kwargs) -> torch.Tensor: + group_list_type: int = 1) -> torch.Tensor: """ apply MLP: gate_up_proj -> swiglu -> down_proj @@ -74,23 +73,6 @@ def apply_mlp(hidden_states: torch.Tensor, else: pertoken_scale = dynamic_scale - shared_experts = kwargs.get('shared_experts', None) - if shared_experts: - shared_gate_up = kwargs.get('shared_gate_up', None) - shared_dynamic_scale = kwargs.get('shared_dynamic_scale', None) - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_gate_up, hidden_states) - shared_x, shared_dynamic_scale = torch_npu.npu_dequant_swiglu_quant( - x=shared_gate_up, - weight_scale=shared_experts.gate_up_proj.weight_scale_fp32, - activation_scale=shared_dynamic_scale, - bias=None, - quant_scale=None, - quant_offset=None, - group_index=None, - activate_left=True, - quant_mode=1) - # gmm1: gate_up_proj hidden_states = torch_npu.npu_grouped_matmul( x=[hidden_states], @@ -120,36 +102,24 @@ def apply_mlp(hidden_states: torch.Tensor, group_list=group_list, output_dtype=w2_scale.dtype)[0] - if shared_experts: - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_x, hidden_states) - shared_output = torch_npu.npu_quant_matmul( - shared_x, - shared_experts.down_proj.weight, - shared_experts.down_proj.weight_scale, - pertoken_scale=shared_dynamic_scale, - output_dtype=torch.bfloat16, - ) - if shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1: - shared_output = tensor_model_parallel_all_reduce(shared_output) - if shared_experts: - return hidden_states, shared_output return hidden_states -def fused_experts_with_mc2(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - w1_scale: torch.Tensor, - w2_scale: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - top_k: int, - expert_map: torch.Tensor = None, - moe_all_to_all_group_name: str = "", - log2phy: torch.Tensor = None, - global_redundant_expert_num: int = 0, - **kwargs) -> torch.Tensor: +def fused_experts_with_mc2( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + moe_all_to_all_group_name: str = "", + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[Any] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if log2phy: topk_ids = log2phy[topk_ids] global_bs = 0 @@ -188,31 +158,17 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, } kwargs_mc2.update(stage1_kwargs) - 
shared_experts = kwargs.get('shared_experts', None) - if shared_experts: - shared_hidden_states = kwargs.get('shared_hidden_states', None) - with tng.scope.npu_stream_switch('cv'): - tng.scope.npu_wait_tensor(shared_hidden_states, hidden_states) - shared_x, shared_dynamic_scale = torch_npu.npu_dynamic_quant( - shared_hidden_states) - shared_gate_up = torch_npu.npu_quant_matmul( - shared_x, - shared_experts.gate_up_proj.weight, - shared_experts.gate_up_proj.weight_scale, - output_dtype=torch.int32, - ) - kwargs.update({ - "shared_gate_up": shared_gate_up, - "shared_dynamic_scale": shared_dynamic_scale, - }) - output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2) # comm_stream.wait_stream(torch.npu.current_stream()) expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[ 0:5] - if quant_mode == 0: - dynamic_scale = None + if shared_experts is not None: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(hidden_states, topk_weights) + shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states) + npu_wait_tensor(shared_gate_up[0], expand_x) + shared_act = shared_experts.act_fn(shared_gate_up) # `expand_x` will be disposed in the `apply_mlp` function down_out_list = apply_mlp(expand_x, @@ -221,12 +177,7 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, w2, w2_scale, expert_token_nums, - dynamic_scale=dynamic_scale, - **kwargs) - - multi_stream = isinstance(down_out_list, tuple) - if multi_stream: - down_out_list, shared_output = down_out_list + dynamic_scale=dynamic_scale) # moeCombine kwargs_mc2 = { @@ -257,9 +208,13 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor, hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2) - if multi_stream: + if shared_experts is None: + return hidden_states + else: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(shared_act[0], down_out_list) + shared_output, _ = shared_experts.down_proj(shared_act) return hidden_states, shared_output - return hidden_states # currently expert parallelism implemented with all2all @@ -541,21 +496,33 @@ def get_perchannel_param( @staticmethod def apply( layer: torch.nn.Module, - x: torch.Tensor, + x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], bias: Optional[torch.Tensor] = None, tp_rank: Optional[int] = 0, ) -> torch.Tensor: - original_dtype = x.dtype - # use ATB quantize - quant_out, dynamic_scale = torch_npu.npu_dynamic_quant(x) - return torch_npu.npu_quant_matmul( - quant_out, + config = getattr(layer, "_ascend_quant_config", {}) + if not isinstance(x, tuple): + output_dtype = config.get("output_dtype", x.dtype) + quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x) + else: + assert "output_dtype" in config.keys(), ( + f"DynamicLinearMethod needs explicitly specified `output_dtype`" + f"for pre-quantized input, got config [{config}]") + output_dtype = config["output_dtype"] + quantized_x, dynamic_scale = x + pertoken_scale = (dynamic_scale + if config.get("pertoken_scale", True) else None) + + output = torch_npu.npu_quant_matmul( + quantized_x, layer.weight, layer.weight_scale, - pertoken_scale=dynamic_scale, + pertoken_scale=pertoken_scale, bias=bias, - output_dtype=original_dtype, + output_dtype=output_dtype, ) + return ((output, dynamic_scale) + if config.get("return_scale", False) else output) def process_weights_after_loading(self, layer): if self.transpose_weight: @@ -650,6 +617,7 @@ def apply( enable_force_load_balance: bool = True, log2phy: torch.Tensor = None, global_redundant_expert_num: int = 0, + 
shared_experts: Optional[Any] = None, **kwargs, ) -> torch.Tensor: assert router_logits.shape[ @@ -706,7 +674,7 @@ def apply( moe_all_to_all_group_name=self.moe_all_to_all_group_name, log2phy=log2phy, global_redundant_expert_num=global_redundant_expert_num, - **kwargs) + shared_experts=shared_experts) elif self.torchair_graph_enabled or self.ep_group.world_size == 1: return fused_experts(hidden_states=x, w1=layer.w13_weight, diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 7d4093804..6c2c6c608 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -19,17 +19,26 @@ import atexit import math -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext from threading import Lock from typing import TYPE_CHECKING, List, Tuple import torch +import torchair # type: ignore[import] # noqa: F401 from packaging.version import InvalidVersion, Version from torch_npu.npu.streams import Event from vllm.logger import logger import vllm_ascend.envs as envs +try: + # Recent release of torchair has moved these ops to `.scope`. + from torchair.scope import npu_stream_switch as _npu_stream_switch + from torchair.scope import npu_wait_tensor as _npu_wait_tensor +except ImportError: + from torchair.ops import NpuStreamSwitch as _npu_stream_switch + from torchair.ops import npu_wait_tensor as _npu_wait_tensor + if TYPE_CHECKING: from vllm.config import VllmConfig else: @@ -44,6 +53,8 @@ ASCEND_QUATIZATION_METHOD = "ascend" +CUSTOM_OP_ENABLED = None + def try_register_lib(lib_name: str, lib_info: str = ""): import importlib @@ -58,6 +69,31 @@ def try_register_lib(lib_name: str, lib_info: str = ""): pass +def enable_custom_op(): + """ + Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component. + Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device(). 
+ """ + global CUSTOM_OP_ENABLED + + if CUSTOM_OP_ENABLED is not None: + return CUSTOM_OP_ENABLED + + else: + try: + # register custom ops into torch_library here + import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401 + CUSTOM_OP_ENABLED = True + + except ImportError: + CUSTOM_OP_ENABLED = False + logger.warning( + "Warning: Failed to register custom ops, all custom ops will be disabled" + ) + + return CUSTOM_OP_ENABLED + + def find_hccl_library() -> str: """ We either use the library file specified by the `HCCL_SO_PATH` @@ -227,3 +263,14 @@ def pop_captured_sync(self) -> dict: durations[tag] = observe_start.elapsed_time(observe_end) return durations + + +def npu_stream_switch(tag: str, priority: int, *, enabled: bool = True): + return _npu_stream_switch(tag, priority) if enabled else nullcontext() + + +def npu_wait_tensor(self: torch.Tensor, + dependency: torch.Tensor, + *, + enabled: bool = True): + return _npu_wait_tensor(self, dependency) if enabled else self diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 18d9f3ac5..8dcad248c 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -68,7 +68,7 @@ from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata from vllm_ascend.platform import NPUPlatform from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler -from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is +from vllm_ascend.utils import ProfileExecuteDuration from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer if TYPE_CHECKING: @@ -1066,8 +1066,8 @@ def execute_model( for tag, duration in durations.items() ] captured_name = "Decode" if self.attn_state == AscendAttentionState.DecodeOnly else "Prefill" - print(f"Profile execute duration [{captured_name}]:", - " ".join(dr_str)) + logger.info("Profile execute duration [%s]:%s", captured_name, + " ".join(dr_str)) return model_runner_output @@ -1360,44 +1360,27 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: import torch_npu kv_caches: Dict[str, torch.Tensor] = {} - # Remove this after we drop 0.9.0 support - if vllm_version_is("0.9.0"): - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.model_config.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=True, - vocab_size=self.model_config.get_vocab_size(), - block_size=self.cache_config.block_size, - ) - else: - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.model_config.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=True, - vocab_size=self.model_config.get_vocab_size(), - block_sizes=[self.cache_config.block_size], - ) + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.model_config.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=True, + vocab_size=self.model_config.get_vocab_size(), + block_sizes=[self.cache_config.block_size], + ) - if not vllm_version_is("0.9.0"): - kv_cache_sizes = {} - for kv_cache_tensor in kv_cache_config.kv_cache_tensors: - assert len(kv_cache_tensor.shared_by) == 1, ( - "KV cache tensor shared by multiple layers is not supported in " - "NPU.") - kv_cache_sizes[ - kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size + kv_cache_sizes = {} + for kv_cache_tensor in kv_cache_config.kv_cache_tensors: + assert len(kv_cache_tensor.shared_by) == 1, ( + "KV 
cache tensor shared by multiple layers is not supported in " + "NPU.") + kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size for kv_cache_group in kv_cache_config.kv_cache_groups: kv_cache_spec = kv_cache_group.kv_cache_spec for layer_name in kv_cache_group.layer_names: - if vllm_version_is("0.9.0"): - tensor_size = kv_cache_config.tensors[layer_name].size - else: - tensor_size = kv_cache_sizes[layer_name] + tensor_size = kv_cache_sizes[layer_name] assert tensor_size % kv_cache_spec.page_size_bytes == 0 num_blocks = tensor_size // kv_cache_spec.page_size_bytes diff --git a/vllm_ascend/worker/multi_step_worker.py b/vllm_ascend/worker/multi_step_worker.py index ba83f6b96..6d092805d 100644 --- a/vllm_ascend/worker/multi_step_worker.py +++ b/vllm_ascend/worker/multi_step_worker.py @@ -119,7 +119,7 @@ def _prepare_last_sampled_token_ids_for_tp_workers( # execute_model_req assert execute_model_req.last_sampled_token_ids is not None model_input.last_sampled_token_ids = ( - execute_model_req.last_sampled_token_ids.cuda()) + execute_model_req.last_sampled_token_ids.npu()) model_input.add_sampler_output( SamplerOutput(outputs=[], sampled_token_ids=None), model_input.last_sampled_token_ids) diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index ba29dcfc5..ebdf01e15 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -117,6 +117,11 @@ def wake_up(self, tags: Optional[list[str]] = None) -> None: allocator = CaMemAllocator.get_instance() allocator.wake_up(tags=tags) + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + def init_device(self): if self.device_config.device.type == "npu": self.device = torch.device(f"npu:{self.local_rank_across_dp}")
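
Note (illustrative sketch, not part of the patch): the dual-stream pattern used above in the MLA and MC2 MoE paths reduces to the two helpers added in vllm_ascend/utils.py. The snippet below shows how they are intended to be combined; `dual_stream_step`, `heavy_main_op`, `light_side_op` and the stream tag "secondary" are hypothetical stand-ins, and only `npu_stream_switch` and `npu_wait_tensor` come from this diff.

import torch

from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor


def dual_stream_step(x: torch.Tensor, heavy_main_op, light_side_op,
                     enabled: bool = True):
    # The main computation is issued on the default stream.
    main_out = heavy_main_op(x)
    # Record the cross-stream dependency: work consuming `x` on the secondary
    # stream must not start before `main_out` is materialized. With
    # enabled=False both helpers degrade to no-ops (single-stream behavior).
    npu_wait_tensor(x, main_out, enabled=enabled)
    with npu_stream_switch("secondary", 0, enabled=enabled):
        # Kernels issued inside this context go to the "secondary" stream and
        # can overlap with subsequent default-stream kernels.
        side_out = light_side_op(x)
    return main_out, side_out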