Tanuj/clean #18

Open

Wants to merge 21 commits into base: main from tanuj/clean
Changes from all 21 commits
55bc634
yield last chunk if it's usage
Jul 3, 2025
cbd64d4
chunk
Jul 3, 2025
c65512b
Upgrade to 0.9.1 & support for `cached_tokens` info in completions re…
amogkam Jul 3, 2025
a992216
Merge pull request #13 from character-tech/tanuj/usage
tanujtiwari1998 Jul 3, 2025
462b269
Implement OpenAI Responses API [1/N] (#20504)
WoosukKwon Jul 7, 2025
47db8c2
[Misc] add a tip for pre-commit (#20536)
reidliu41 Jul 7, 2025
6e2c19c
[Refactor]Abstract Platform Interface for Distributed Backend and Add…
dbyoung18 Jul 7, 2025
2e610de
[CI/Build] Enable phi2 lora test (#20540)
jeejeelee Jul 7, 2025
2c5ebec
[XPU][CI] add v1/core test in xpu hardware ci (#20537)
Liangliang-Ma Jul 7, 2025
1fd471e
Add docstrings to url_schemes.py to improve readability (#20545)
windsonsea Jul 7, 2025
3112271
[XPU] log clean up for XPU platform (#20553)
yma11 Jul 7, 2025
eb0b2d2
[Docs] Clean up tables in supported_models.md (#20552)
windsonsea Jul 7, 2025
448acad
[Misc] remove unused jinaai_serving_reranking (#18878)
Abirdcfly Jul 7, 2025
4ff79a1
[Misc] Set the minimum openai version (#20539)
jeejeelee Jul 7, 2025
6e4bef1
[Doc] Remove extra whitespace from CI failures doc (#20565)
hmellor Jul 7, 2025
45877ef
[Doc] Use `gh-pr` and `gh-issue` everywhere we can in the docs (#20564)
hmellor Jul 7, 2025
923147b
[Doc] Fix internal links so they don't always point to latest (#20563)
hmellor Jul 7, 2025
b8a498c
[Doc] Add outline for content tabs (#20571)
hmellor Jul 7, 2025
1ad69e8
[Doc] Fix some MkDocs snippets used in the installation docs (#20572)
hmellor Jul 7, 2025
110df74
[Model][Last/4] Automatic conversion of CrossEncoding model (#19675)
noooop Jul 7, 2025
689260b
Merge remote-tracking branch 'upstream/main' into tanuj/clean
Jul 7, 2025
6 changes: 4 additions & 2 deletions .buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -11,8 +11,8 @@ container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head
docker build -t ${image_name} -f docker/Dockerfile.xpu .

# Setup cleanup
remove_docker_container() {
docker rm -f "${container_name}" || true;
remove_docker_container() {
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
@@ -27,4 +27,6 @@ docker run \
"${image_name}" \
sh -c '
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
'
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -170,7 +170,7 @@ repos:
# Keep `suggestion` last
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
entry: bash -c 'echo "To bypass all the pre-commit hooks, add --no-verify to git commit. To skip a specific hook, prefix the commit command with SKIP=<hook-id>."'
language: system
verbose: true
pass_filenames: false
27 changes: 27 additions & 0 deletions csrc/cpu/torch_bindings.cpp
@@ -191,6 +191,33 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"Tensor? azp) -> ()");
ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);

// Compute int8 quantized tensor and scaling factor
ops.def(
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
"Tensor!? azp) -> ()");
ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
&dynamic_scaled_int8_quant);
// W8A8 GEMM, supporting symmetric quantization.
ops.def(
"cutlass_scaled_mm(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm_ppc64le);
// w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
// quantization.
ops.def(
"cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor azp_adj,"
" Tensor? azp, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
#elif defined(__powerpc64__)
// Compute int8 quantized tensor for given scaling factor.
ops.def(
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
"Tensor? azp) -> ()");
ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);

// Compute int8 quantized tensor and scaling factor
ops.def(
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
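For reference, ops registered this way can be invoked from Python through `torch.ops` once the CPU extension is built. The sketch below is a minimal illustration only; it assumes the extension is exposed as `torch.ops._C` (vLLM's usual `TORCH_EXTENSION_NAME`) and that the dynamic per-token scale buffer has shape `(num_tokens, 1)`:

```python
import torch

# Minimal sketch, assuming the CPU extension from this file is built and
# registered under torch.ops._C; shapes and the scale layout are assumptions.
x = torch.randn(4, 128, dtype=torch.float32)    # input activations
out = torch.empty_like(x, dtype=torch.int8)     # int8 output, written in place
scale = torch.empty(4, 1, dtype=torch.float32)  # per-token scales, filled by the op

# Dynamic (per-token) symmetric int8 quantization; passing None for azp
# selects the symmetric path with no zero point.
torch.ops._C.dynamic_scaled_int8_quant(out, x, scale, None)
```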
23 changes: 23 additions & 0 deletions csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -294,6 +294,29 @@ void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
version_num, ". Required capability: 90 or 100");
}

void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
torch::Tensor& problem_sizes1,
torch::Tensor& problem_sizes2,
const torch::Tensor& expert_num_tokens,
const int64_t num_local_experts,
const int64_t padded_m, const int64_t n,
const int64_t k) {
// This function currently gets compiled only if we have a valid cutlass moe
// mm to run it for.
int32_t version_num = get_sm_version_num();
#if defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90
get_cutlass_pplx_moe_mm_data_caller(expert_offsets, problem_sizes1,
problem_sizes2, expert_num_tokens,
num_local_experts, padded_m, n, k);
return;
#endif
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"No compiled get_cutlass_pplx_moe_mm_data: no cutlass_scaled_mm kernel "
"for CUDA device capability: ",
version_num, ". Required capability: 90");
}

void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
2 changes: 1 addition & 1 deletion docker/Dockerfile.xpu
@@ -47,7 +47,7 @@ FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0'
pip install accelerate hf_transfer pytest 'modelscope!=1.15.0'

ENV VLLM_USAGE_SOURCE production-docker-image \
TRITON_XPU_PROFILE 1
12 changes: 5 additions & 7 deletions docs/ci/update_pytorch_version.md
@@ -7,9 +7,8 @@ release in CI/CD. It is standard practice to submit a PR to update the
PyTorch version as early as possible when a new [PyTorch stable
release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
This process is non-trivial due to the gap between PyTorch
releases. Using [#16859](https://github.com/vllm-project/vllm/pull/16859) as
an example, this document outlines common steps to achieve this update along with
a list of potential issues and how to address them.
releases. Using <gh-pr:16859> as an example, this document outlines common steps to achieve this
update along with a list of potential issues and how to address them.

## Test PyTorch release candidates (RCs)

@@ -68,7 +67,7 @@ and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mod
it doesn't populate the cache, so re-running it to warm up the cache
is ineffective.

While ongoing efforts like [#17419](https://github.com/vllm-project/vllm/issues/17419)
While ongoing efforts like [#17419](gh-issue:17419)
address the long build time at its source, the current workaround is to set VLLM_CI_BRANCH
to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
when manually triggering a build on Buildkite. This branch accomplishes two things:
@@ -129,6 +128,5 @@ to handle some platforms separately. The separation of requirements and Dockerfi
for different platforms in vLLM CI/CD allows us to selectively choose
which platforms to update. For instance, updating XPU requires the corresponding
release from https://github.com/intel/intel-extension-for-pytorch by Intel.
While https://github.com/vllm-project/vllm/pull/16859 updated vLLM to PyTorch
2.7.0 on CPU, CUDA, and ROCm, https://github.com/vllm-project/vllm/pull/17444
completed the update for XPU.
While <gh-pr:16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
<gh-pr:17444> completed the update for XPU.
40 changes: 20 additions & 20 deletions docs/contributing/ci-failures.md
@@ -6,9 +6,9 @@ the failure?
- Check the dashboard of current CI test failures:
👉 [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20)

- If your failure **is already listed**, it's likely unrelated to your PR.
Help fixing it is always welcome!
- Leave comments with links to additional instances of the failure.
- If your failure **is already listed**, it's likely unrelated to your PR.
Help fixing it is always welcome!
- Leave comments with links to additional instances of the failure.
- React with a 👍 to signal how many are affected.

- If your failure **is not listed**, you should **file an issue**.
@@ -19,43 +19,43 @@ the failure?
👉 [New CI Failure Report](https://github.com/vllm-project/vllm/issues/new?template=450-ci-failure.yml)

- **Use this title format:**

```
[CI Failure]: failing-test-job - regex/matching/failing:test
```

- **For the environment field:**

```
Still failing on main as of commit abcdef123
```

- **In the description, include failing tests:**

```
FAILED failing/test.py:failing_test1 - Failure description
FAILED failing/test.py:failing_test2 - Failure description
https://github.com/orgs/vllm-project/projects/20
https://github.com/vllm-project/vllm/issues/new?template=400-bug-report.yml
FAILED failing/test.py:failing_test3 - Failure description
FAILED failing/test.py:failing_test1 - Failure description
FAILED failing/test.py:failing_test2 - Failure description
https://github.com/orgs/vllm-project/projects/20
https://github.com/vllm-project/vllm/issues/new?template=400-bug-report.yml
FAILED failing/test.py:failing_test3 - Failure description
```

- **Attach logs** (collapsible section example):
<details>
<summary>Logs:</summary>

```text
ERROR 05-20 03:26:38 [dump_input.py:68] Dumping input data
ERROR 05-20 03:26:38 [dump_input.py:68] Dumping input data
--- Logging error ---
Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 203, in execute_model
return self.model_executor.execute_model(scheduler_output)
return self.model_executor.execute_model(scheduler_output)
...
FAILED failing/test.py:failing_test1 - Failure description
FAILED failing/test.py:failing_test2 - Failure description
FAILED failing/test.py:failing_test3 - Failure description
FAILED failing/test.py:failing_test1 - Failure description
FAILED failing/test.py:failing_test2 - Failure description
FAILED failing/test.py:failing_test3 - Failure description
```

</details>

## Logs Wrangling
Expand All @@ -78,7 +78,7 @@ tail -525 ci_build.log | wl-copy

## Investigating a CI Test Failure

1. Go to 👉 [Buildkite main branch](https://buildkite.com/vllm/ci/builds?branch=main)
1. Go to 👉 [Buildkite main branch](https://buildkite.com/vllm/ci/builds?branch=main)
2. Bisect to find the first build that shows the issue.
3. Add your findings to the GitHub issue.
4. If you find a strong candidate PR, mention it in the issue and ping contributors.
@@ -97,9 +97,9 @@ CI test failures may be flaky. Use a bash loop to run repeatedly:
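The loop itself is collapsed in this view. Purely as an illustrative stand-in (the doc's snippet is a bash loop, and the test id below is a placeholder), a rerun helper could look like:

```python
# Stand-in sketch, not the snippet from ci-failures.md: rerun an assumed
# pytest selection several times to check whether it fails intermittently.
import subprocess

for attempt in range(1, 11):
    result = subprocess.run(
        ["pytest", "-q", "failing/test.py::failing_test1"],  # placeholder test id
        capture_output=True,
        text=True,
    )
    status = "FAIL" if result.returncode else "PASS"
    print(f"attempt {attempt}: {status}")
    if result.returncode:
        print(result.stdout[-2000:])  # tail of the failing run's output
```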

If you submit a PR to fix a CI failure:

- Link the PR to the issue:
- Link the PR to the issue:
Add `Closes #12345` to the PR description.
- Add the `ci-failure` label:
- Add the `ci-failure` label:
This helps track it in the [CI Failures GitHub Project](https://github.com/orgs/vllm-project/projects/20).

## Other Resources
6 changes: 3 additions & 3 deletions docs/features/spec_decode.md
@@ -217,8 +217,8 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
A few important things to consider when using the EAGLE based draft models:

1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should
be able to be loaded and used directly by vLLM after [PR 12304](https://github.com/vllm-project/vllm/pull/12304).
If you are using vllm version before [PR 12304](https://github.com/vllm-project/vllm/pull/12304), please use the
be able to be loaded and used directly by vLLM after <gh-pr:12304>.
If you are using vllm version before <gh-pr:12304>, please use the
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
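A minimal offline sketch of wiring an EAGLE draft model into `speculative_config` is shown below; the model names and config keys are illustrative assumptions and may need adjusting for your vLLM version:

```python
# Sketch only: both model names are placeholders, and the exact
# speculative_config keys are assumed for a recent vLLM release.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    speculative_config={
        "method": "eagle",
        "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
        "num_speculative_tokens": 5,
    },
)
outputs = llm.generate(["The capital of France is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```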

@@ -228,7 +228,7 @@ A few important things to consider when using the EAGLE based draft models:

3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565).
investigation and tracked here: <gh-issue:9565>.

A variety of EAGLE draft models are available on the Hugging Face hub:

6 changes: 3 additions & 3 deletions docs/features/structured_outputs.md
@@ -157,7 +157,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
print(completion.choices[0].message.content)
```

See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
See also: [full example](../examples/online_serving/structured_outputs.md)

## Reasoning Outputs

@@ -200,7 +200,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th
print("content: ", completion.choices[0].message.content)
```

See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
See also: [full example](../examples/online_serving/structured_outputs.md)

## Experimental Automatic Parsing (OpenAI API)

@@ -325,4 +325,4 @@ shown below:
print(outputs[0].outputs[0].text)
```

See also: [full example](https://docs.vllm.ai/en/latest/examples/online_serving/structured_outputs.html)
See also: [full example](../examples/online_serving/structured_outputs.md)
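For a concrete flavour of the feature behind these links, here is a small sketch of structured output over the OpenAI-compatible API; the server address, model name, and the `guided_choice` extra-body parameter are assumptions to adapt to your setup:

```python
# Sketch only: assumes a vLLM OpenAI-compatible server is running locally
# and accepts the guided_choice extra_body parameter; model is a placeholder.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.chat.completions.create(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model name
    messages=[{"role": "user", "content": "Classify the sentiment: vLLM is wonderful!"}],
    extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)
```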
117 changes: 117 additions & 0 deletions docs/getting_started/installation/ai_accelerator.md
@@ -0,0 +1,117 @@
# Other AI accelerators

vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions:

=== "Google TPU"

--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:installation"

=== "Intel Gaudi"

--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:installation"

=== "AWS Neuron"

--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:installation"

## Requirements

=== "Google TPU"

--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:requirements"

=== "Intel Gaudi"

--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:requirements"

=== "AWS Neuron"

--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:requirements"

## Configure a new environment

=== "Google TPU"

--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:configure-a-new-environment"

=== "Intel Gaudi"

--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:configure-a-new-environment"

=== "AWS Neuron"

--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:configure-a-new-environment"

## Set up using Python

### Pre-built wheels

=== "Google TPU"

--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-wheels"

=== "Intel Gaudi"

--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-wheels"

=== "AWS Neuron"

--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-wheels"

### Build wheel from source

=== "Google TPU"

--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-wheel-from-source"

=== "Intel Gaudi"

--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-wheel-from-source"

=== "AWS Neuron"

--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-wheel-from-source"

## Set up using Docker

### Pre-built images

=== "Google TPU"

--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-images"

=== "Intel Gaudi"

--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-images"

=== "AWS Neuron"

--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-images"

### Build image from source

=== "Google TPU"

--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-image-from-source"

=== "Intel Gaudi"

--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-image-from-source"

=== "AWS Neuron"

--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-image-from-source"

## Extra information

=== "Google TPU"

--8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:extra-information"

=== "Intel Gaudi"

--8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:extra-information"

=== "AWS Neuron"

--8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:extra-information"