diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index 039c0332c5..ec70f6dec7 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -66,137 +66,137 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  lint:
-    # Only trigger lint on pull request
-    if: ${{ github.event_name == 'pull_request' }}
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.10"]
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements-lint.txt
-      - name: Run codespell check
-        run: |
-          CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**')
-          CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn')
-
-          codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}"
-      - name: Analysing the code with ruff
-        run: |
-          echo "::add-matcher::.github/workflows/matchers/ruff.json"
-          ruff check --output-format github .
-      - name: Run isort
-        run: |
-          isort . --check-only
-      - name: Running yapf
-        run: |
-          python -m pip install --upgrade pip
-          pip install toml
-          pip install yapf==0.32.0
-          yapf --diff --recursive .
-
-      - name: Install dependencies
-        run: |
-          pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          path: vllm-empty
-
-      - name: Actionlint Check
-        env:
-          SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
-        run: |
-          echo "::add-matcher::.github/workflows/matchers/actionlint.json"
-          tools/actionlint.sh -color
-
-      - name: Install vllm-project/vllm from source
-        working-directory: vllm-empty
-        run: |
-          pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
-          VLLM_TARGET_DEVICE=empty pip install .
-
-      - name: Mypy Check
-        run: |
-          echo "::add-matcher::.github/workflows/matchers/mypy.json"
-          tools/mypy.sh 1 ${{ matrix.python-version }}
-
-  ut:
-    needs: [lint]
-    name: unit test
-    # only trigger e2e test on [pull request after lint passed] and [merged commit]
-    if: ${{ needs.lint.result == 'success' || github.event_name == 'push' }}
-    runs-on: ubuntu-latest
-    container:
-      image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
-      env:
-        VLLM_LOGGING_LEVEL: ERROR
-        VLLM_USE_MODELSCOPE: True
-    strategy:
-      matrix:
-        vllm_version: [main, v0.9.1]
-    steps:
-      - name: Install packages
-        run: |
-          apt-get update -y
-          apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          ref: ${{ matrix.vllm_version }}
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/
-          python3 -m pip uninstall -y triton
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
-
-      - name: Install vllm-project/vllm-ascend
-        run: |
-          export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
-          python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
-          python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/
-
-      - name: Run unit test for V1 Engine
-        env:
-          VLLM_USE_V1: 1
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          TORCH_DEVICE_BACKEND_AUTOLOAD: 0
-        run: |
-          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
-          pytest -sv --cov --cov-report=xml:unittests-coverage.xml tests/ut
-
-      - name: Upload coverage to Codecov
-        if: ${{ matrix.vllm_version == 'main' }}
-        uses: codecov/codecov-action@v5
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-        with:
-          flags: unittests
-          name: vllm-ascend
-          verbose: true
+  # lint:
+  #   # Only trigger lint on pull request
+  #   if: ${{ github.event_name == 'pull_request' }}
+  #   runs-on: ubuntu-latest
+  #   strategy:
+  #     matrix:
+  #       python-version: ["3.10"]
+  #   steps:
+  #     - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+  #     - name: Set up Python ${{ matrix.python-version }}
+  #       uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+  #       with:
+  #         python-version: ${{ matrix.python-version }}
+  #     - name: Install dependencies
+  #       run: |
+  #         python -m pip install --upgrade pip
+  #         pip install -r requirements-lint.txt
+  #     - name: Run codespell check
+  #       run: |
+  #         CODESPELL_EXCLUDES=('--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**')
+  #         CODESPELL_IGNORE_WORDS=('-L' 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,assertIn')
+
+  #         codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" "${CODESPELL_IGNORE_WORDS[@]}"
+  #     - name: Analysing the code with ruff
+  #       run: |
+  #         echo "::add-matcher::.github/workflows/matchers/ruff.json"
+  #         ruff check --output-format github .
+  #     - name: Run isort
+  #       run: |
+  #         isort . --check-only
+  #     - name: Running yapf
+  #       run: |
+  #         python -m pip install --upgrade pip
+  #         pip install toml
+  #         pip install yapf==0.32.0
+  #         yapf --diff --recursive .
+
+  #     - name: Install dependencies
+  #       run: |
+  #         pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu
+
+  #     - name: Checkout vllm-project/vllm repo
+  #       uses: actions/checkout@v4
+  #       with:
+  #         repository: vllm-project/vllm
+  #         path: vllm-empty
+
+  #     - name: Actionlint Check
+  #       env:
+  #         SHELLCHECK_OPTS: --exclude=SC2046,SC2006,SC2086
+  #       run: |
+  #         echo "::add-matcher::.github/workflows/matchers/actionlint.json"
+  #         tools/actionlint.sh -color
+
+  #     - name: Install vllm-project/vllm from source
+  #       working-directory: vllm-empty
+  #       run: |
+  #         pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+  #         VLLM_TARGET_DEVICE=empty pip install .
+
+  #     - name: Mypy Check
+  #       run: |
+  #         echo "::add-matcher::.github/workflows/matchers/mypy.json"
+  #         tools/mypy.sh 1 ${{ matrix.python-version }}
+
+  # ut:
+  #   needs: [lint]
+  #   name: unit test
+  #   # only trigger e2e test on [pull request after lint passed] and [merged commit]
+  #   if: ${{ needs.lint.result == 'success' || github.event_name == 'push' }}
+  #   runs-on: ubuntu-latest
+  #   container:
+  #     image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
+  #     env:
+  #       VLLM_LOGGING_LEVEL: ERROR
+  #       VLLM_USE_MODELSCOPE: True
+  #   strategy:
+  #     matrix:
+  #       vllm_version: [main, v0.9.1]
+  #   steps:
+  #     - name: Install packages
+  #       run: |
+  #         apt-get update -y
+  #         apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
+
+  #     - name: Checkout vllm-project/vllm repo
+  #       uses: actions/checkout@v4
+  #       with:
+  #         repository: vllm-project/vllm
+  #         ref: ${{ matrix.vllm_version }}
+  #         path: ./vllm-empty
+
+  #     - name: Install vllm-project/vllm from source
+  #       working-directory: ./vllm-empty
+  #       run: |
+  #         VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/
+  #         python3 -m pip uninstall -y triton
+
+  #     - name: Checkout vllm-project/vllm-ascend repo
+  #       uses: actions/checkout@v4
+
+  #     - name: Install vllm-project/vllm-ascend
+  #       run: |
+  #         export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
+  #         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
+  #         python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
+  #         python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/
+
+  #     - name: Run unit test for V1 Engine
+  #       env:
+  #         VLLM_USE_V1: 1
+  #         VLLM_WORKER_MULTIPROC_METHOD: spawn
+  #         TORCH_DEVICE_BACKEND_AUTOLOAD: 0
+  #       run: |
+  #         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
+  #         pytest -sv --cov --cov-report=xml:unittests-coverage.xml tests/ut
+
+  #     - name: Upload coverage to Codecov
+  #       if: ${{ matrix.vllm_version == 'main' }}
+  #       uses: codecov/codecov-action@v5
+  #       env:
+  #         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+  #       with:
+  #         flags: unittests
+  #         name: vllm-ascend
+  #         verbose: true
 
   e2e:
-    needs: [lint]
+    #needs: [lint]
     # only trigger e2e test on pull request after lint passed
-    if: ${{ needs.lint.result == 'success' && github.event_name == 'pull_request' }}
+    if: ${{ github.event_name == 'pull_request' }}
     strategy:
       max-parallel: 2
       matrix:
@@ -256,19 +256,20 @@ jobs:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           VLLM_USE_MODELSCOPE: True
         run: |
-          pytest -sv tests/e2e/singlecard/test_offline_inference.py
-          # TODO: switch hf to modelscope
-          VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
-            pytest -sv tests/e2e/singlecard/test_ilama_lora.py
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
-          pytest -sv tests/e2e/singlecard/test_camem.py
-          pytest -sv tests/e2e/singlecard/test_embedding.py
-          pytest -sv tests/e2e/singlecard/ \
-          --ignore=tests/e2e/singlecard/test_offline_inference.py \
-          --ignore=tests/e2e/singlecard/test_ilama_lora.py \
-          --ignore=tests/e2e/singlecard/test_guided_decoding.py \
-          --ignore=tests/e2e/singlecard/test_camem.py \
-          --ignore=tests/e2e/singlecard/test_embedding.py
+          # pytest -sv tests/e2e/singlecard/test_offline_inference.py
+          # # TODO: switch hf to modelscope
+          # VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
+          #   pytest -sv tests/e2e/singlecard/test_ilama_lora.py
+          # pytest -sv tests/e2e/singlecard/test_guided_decoding.py
+          # pytest -sv tests/e2e/singlecard/test_camem.py
+          # pytest -sv tests/e2e/singlecard/test_embedding.py
+          # pytest -sv tests/e2e/singlecard/ \
+          # --ignore=tests/e2e/singlecard/test_offline_inference.py \
+          # --ignore=tests/e2e/singlecard/test_ilama_lora.py \
+          # --ignore=tests/e2e/singlecard/test_guided_decoding.py \
+          # --ignore=tests/e2e/singlecard/test_camem.py \
+          # --ignore=tests/e2e/singlecard/test_embedding.py
 
       - name: Run e2e test on V0 engine
         if: ${{ github.event_name == 'schedule' }}
@@ -276,120 +277,121 @@ jobs:
           VLLM_USE_V1: 0
           VLLM_USE_MODELSCOPE: True
         run: |
-          pytest -sv tests/e2e/singlecard/test_offline_inference.py
-          # TODO: switch hf to modelscope
-          VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
-            pytest -sv tests/e2e/singlecard/test_ilama_lora.py
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
-          pytest -sv tests/e2e/singlecard/test_camem.py
-          pytest -sv tests/e2e/singlecard/test_prompt_embedding.py
-          pytest -sv tests/e2e/singlecard/test_embedding.py
-          pytest -sv tests/e2e/singlecard/ \
-            --ignore=tests/e2e/singlecard/test_offline_inference.py \
-            --ignore=tests/e2e/singlecard/test_ilama_lora.py \
-            --ignore=tests/e2e/singlecard/test_guided_decoding.py \
-            --ignore=tests/e2e/singlecard/test_camem.py \
-            --ignore=tests/e2e/singlecard/test_prompt_embedding.py \
-            --ignore=tests/e2e/singlecard/core/test_ascend_scheduler.py \
-            --ignore=tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py \
-            --ignore=tests/e2e/singlecard/test_embedding.py
-
-  e2e-4-cards:
-    needs: [e2e]
-    if: ${{ needs.e2e.result == 'success' }}
-    strategy:
-      max-parallel: 1
-      matrix:
-        os: [linux-arm64-npu-4]
-        vllm_version: [main, v0.9.1]
-    name: multicard e2e test
-    runs-on: ${{ matrix.os }}
-    container:
-      # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
-      image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
-      env:
-        VLLM_LOGGING_LEVEL: ERROR
-        VLLM_USE_MODELSCOPE: True
-    steps:
-      - name: Check npu and CANN info
-        run: |
-          npu-smi info
-          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
-
-      - name: Config mirrors
-        run: |
-          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
-          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
-          apt-get update -y
-          apt install git -y
-          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
-
-      - name: Checkout vllm-project/vllm-ascend repo
-        uses: actions/checkout@v4
-
-      - name: Install system dependencies
-        run: |
-          apt-get -y install `cat packages.txt`
-          apt-get -y install gcc g++ cmake libnuma-dev
-
-      - name: Checkout vllm-project/vllm repo
-        uses: actions/checkout@v4
-        with:
-          repository: vllm-project/vllm
-          ref: ${{ matrix.vllm_version }}
-          path: ./vllm-empty
-
-      - name: Install vllm-project/vllm from source
-        working-directory: ./vllm-empty
-        run: |
-          VLLM_TARGET_DEVICE=empty pip install -e .
-
-      - name: Install vllm-project/vllm-ascend
-        env:
-          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
-        run: |
-          pip install -r requirements-dev.txt
-          pip install -v -e .
-
-      - name: Run vllm-project/vllm-ascend test for V1 Engine
-        env:
-          VLLM_USE_V1: 1
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          VLLM_USE_MODELSCOPE: True
-        run: |
-          # TODO: switch hf to modelscope
-          VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
-            pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
-          # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
-          # To avoid oom, we need to run the test in a single process.
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
-          pytest -sv tests/e2e/multicard/test_data_parallel.py
-          pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
-            --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
-            --ignore=tests/e2e/multicard/test_data_parallel.py
-
-      - name: Run vllm-project/vllm-ascend test on V0 engine
-        if: ${{ github.event_name == 'schedule' }}
-        env:
-          VLLM_USE_V1: 0
-          VLLM_USE_MODELSCOPE: True
-        run: |
-          # TODO: switch hf to modelscope
-          VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
-            pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
-          # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
-          # To avoid oom, we need to run the test in a single process.
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
-          pytest -sv tests/e2e/multicard/test_data_parallel.py
-          pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
-            --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
-            --ignore=tests/e2e/multicard/test_data_parallel.py
+          # pytest -sv tests/e2e/singlecard/test_offline_inference.py
+          # # TODO: switch hf to modelscope
+          # VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
+          #   pytest -sv tests/e2e/singlecard/test_ilama_lora.py
+          # pytest -sv tests/e2e/singlecard/test_guided_decoding.py
+          # pytest -sv tests/e2e/singlecard/test_camem.py
+          # pytest -sv tests/e2e/singlecard/test_prompt_embedding.py
+          # pytest -sv tests/e2e/singlecard/test_embedding.py
+          # pytest -sv tests/e2e/singlecard/ \
+          #   --ignore=tests/e2e/singlecard/test_offline_inference.py \
+          #   --ignore=tests/e2e/singlecard/test_ilama_lora.py \
+          #   --ignore=tests/e2e/singlecard/test_guided_decoding.py \
+          #   --ignore=tests/e2e/singlecard/test_camem.py \
+          #   --ignore=tests/e2e/singlecard/test_prompt_embedding.py \
+          #   --ignore=tests/e2e/singlecard/core/test_ascend_scheduler.py \
+          #   --ignore=tests/e2e/singlecard/core/test_ascend_scheduler_e2e.py \
+          #   --ignore=tests/e2e/singlecard/test_embedding.py
+
+  # e2e-4-cards:
+  #   needs: [e2e]
+  #   if: ${{ needs.e2e.result == 'success' }}
+  #   strategy:
+  #     max-parallel: 1
+  #     matrix:
+  #       os: [linux-arm64-npu-4]
+  #       vllm_version: [main, v0.9.1]
+  #   name: multicard e2e test
+  #   runs-on: ${{ matrix.os }}
+  #   container:
+  #     # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
+  #     image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
+  #     env:
+  #       VLLM_LOGGING_LEVEL: ERROR
+  #       VLLM_USE_MODELSCOPE: True
+  #   steps:
+  #     - name: Check npu and CANN info
+  #       run: |
+  #         npu-smi info
+  #         cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
+
+  #     - name: Config mirrors
+  #       run: |
+  #         sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+  #         pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+  #         apt-get update -y
+  #         apt install git -y
+  #         git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
+
+  #     - name: Checkout vllm-project/vllm-ascend repo
+  #       uses: actions/checkout@v4
+
+  #     - name: Install system dependencies
+  #       run: |
+  #         apt-get -y install `cat packages.txt`
+  #         apt-get -y install gcc g++ cmake libnuma-dev
+
+  #     - name: Checkout vllm-project/vllm repo
+  #       uses: actions/checkout@v4
+  #       with:
+  #         repository: vllm-project/vllm
+  #         ref: ${{ matrix.vllm_version }}
+  #         path: ./vllm-empty
+
+  #     - name: Install vllm-project/vllm from source
+  #       working-directory: ./vllm-empty
+  #       run: |
+  #         VLLM_TARGET_DEVICE=empty pip install -e .
+
+  #     - name: Install vllm-project/vllm-ascend
+  #       env:
+  #         PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
+  #       run: |
+  #         pip install -r requirements-dev.txt
+  #         pip install -v -e .
+
+  #     - name: Run vllm-project/vllm-ascend test for V1 Engine
+  #       env:
+  #         VLLM_USE_V1: 1
+  #         VLLM_WORKER_MULTIPROC_METHOD: spawn
+  #         VLLM_USE_MODELSCOPE: True
+  #       run: |
+  #         # TODO: switch hf to modelscope
+  #         VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
+  #           pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
+  #         # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
+  #         # To avoid oom, we need to run the test in a single process.
+  #         pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
+  #         pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
+  #         pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+  #         pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
+  #         pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
+  #         pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
+  #         pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
+  #         pytest -sv tests/e2e/multicard/test_data_parallel.py
+  #         pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
+  #           --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
+  #           --ignore=tests/e2e/multicard/test_data_parallel.py
+
+  #     - name: Run vllm-project/vllm-ascend test on V0 engine
+  #       if: ${{ github.event_name == 'schedule' }}
+  #       env:
+  #         VLLM_USE_V1: 0
+  #         VLLM_USE_MODELSCOPE: True
+  #       run: |
+  #         # TODO: switch hf to modelscope
+  #         VLLM_USE_MODELSCOPE=False HF_ENDPOINT=https://hf-mirror.com \
+  #           pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
+  #         # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
+  #         # To avoid oom, we need to run the test in a single process.
+  #         pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
+  #         pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek
+  #         pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
+  #         pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
+  #         pytest -sv tests/e2e/multicard/test_data_parallel.py
+  #         pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
+  #           --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
+  #           --ignore=tests/e2e/multicard/test_data_parallel.py
diff --git a/tests/conftest.py b/tests/conftest.py
index efdd8496c1..50172fee4d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -73,8 +73,16 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         import ray  # Lazy import Ray
         ray.shutdown()
     gc.collect()
-    torch.npu.empty_cache()
-    torch.npu.reset_peak_memory_stats()
+    from vllm.platforms import current_platform
+    empty_cache = current_platform.empty_cache
+    if empty_cache is not None:
+        empty_cache()
+    try:
+        if not current_platform.is_cpu():
+            torch._C._host_emptyCache()
+    except AttributeError:
+        logger.warning(
+            "torch._C._host_emptyCache() only available in Pytorch >=2.5")
 
 
 class VllmRunner:
diff --git a/tests/e2e/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py
index 9d103a5308..43d3398d4d 100644
--- a/tests/e2e/singlecard/test_guided_decoding.py
+++ b/tests/e2e/singlecard/test_guided_decoding.py
@@ -24,8 +24,10 @@
 import pytest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
+from vllm.entrypoints.llm import LLM
 
-from tests.conftest import VllmRunner
+from tests.conftest import VllmRunner, cleanup_dist_env_and_memory
+import weakref
 
 os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
 MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
@@ -35,6 +37,17 @@
 GuidedDecodingBackend = list(
     set(GuidedDecodingBackendV0 + GuidedDecodingBackendV1))
 
+@pytest.fixture(scope="module")
+def llm():
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(model=MODEL_NAME, max_model_len=1024, seed=0, enforce_eager=True, dtype="half")
+
+    with llm.deprecate_legacy_api():
+        yield weakref.proxy(llm)
+        del llm
+    cleanup_dist_env_and_memory()
+
 
 @pytest.fixture(scope="module")
 def sample_regex():
@@ -94,32 +107,24 @@ def check_backend(guided_decoding_backend: str):
 
 
 @pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
-def test_guided_json_completion(guided_decoding_backend: str,
+def test_guided_json_completion(guided_decoding_backend: str, llm,
                                 sample_json_schema):
     check_backend(guided_decoding_backend)
 
     sampling_params = SamplingParams(
         temperature=1.0,
         max_tokens=500,
-        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
-
-    with VllmRunner(
-            MODEL_NAME,
-            seed=0,
-            dtype="auto",
-            guided_decoding_backend=guided_decoding_backend,
-    ) as vllm_model:
-        prompts = [
+        guided_decoding=GuidedDecodingParams(json=sample_json_schema, backend=guided_decoding_backend))
+
+    prompts = [
             f"Give an example JSON for an employee profile "
             f"that fits this schema: {sample_json_schema}"
         ] * 2
-        inputs = vllm_model.get_inputs(prompts)
-        outputs = vllm_model.model.generate(inputs,
-                                            sampling_params=sampling_params)
+    outputs = llm.generate(prompts, sampling_params=sampling_params, use_tqdm=True)
 
-        assert outputs is not None
+    assert outputs is not None
 
-        for output in outputs:
+    for output in outputs:
             assert output is not None
             assert isinstance(output, RequestOutput)
             prompt = output.prompt
@@ -133,28 +138,20 @@ def test_guided_json_completion(guided_decoding_backend: str,
 
 
 @pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
-def test_guided_regex(guided_decoding_backend: str, sample_regex):
+def test_guided_regex(guided_decoding_backend: str, sample_regex, llm):
     check_backend(guided_decoding_backend)
 
     sampling_params = SamplingParams(
         temperature=0.8,
         top_p=0.95,
-        guided_decoding=GuidedDecodingParams(regex=sample_regex))
-
-    with VllmRunner(
-            MODEL_NAME,
-            seed=0,
-            dtype="auto",
-            guided_decoding_backend=guided_decoding_backend,
-    ) as vllm_model:
-        prompts = [
+        guided_decoding=GuidedDecodingParams(regex=sample_regex, backend=guided_decoding_backend))
+
+    prompts = [
             f"Give an example IPv4 address with this regex: {sample_regex}"
         ] * 2
-        inputs = vllm_model.get_inputs(prompts)
-        outputs = vllm_model.model.generate(inputs,
-                                            sampling_params=sampling_params)
-        assert outputs is not None
-        for output in outputs:
+    outputs = llm.generate(prompts, sampling_params=sampling_params, use_tqdm=True)
+    assert outputs is not None
+    for output in outputs:
             assert output is not None
             assert isinstance(output, RequestOutput)
             prompt = output.prompt