
Commit 16af49c

Merge remote-tracking branch 'upstream/main'

2 parents: a31e5d8 + 2b16104, commit: 16af49c

73 files changed: +1768 / -500 lines


.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion

@@ -64,7 +64,7 @@ steps:
       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
     plugins:
       - docker-login#v3.0.0:
-          username: vllm
+          username: vllmbot
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 3 additions & 1 deletion

@@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
+HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
 
 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
-aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
 
 # prune old image and containers to save disk space, and only once a day
 # by using a timestamp file in tmp.

@@ -47,6 +48,7 @@ trap remove_docker_container EXIT
 docker run --rm -it --device=/dev/neuron0 --network bridge \
     -v "${HF_CACHE}:${HF_MOUNT}" \
     -e "HF_HOME=${HF_MOUNT}" \
+    -e "HF_TOKEN=${HF_TOKEN}" \
     -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
     -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
    --name "${container_name}" \

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 0 deletions

@@ -141,6 +141,7 @@ steps:
   - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
+  - tests/distributed/test_events
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py

@@ -159,6 +160,7 @@ steps:
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests

csrc/cutlass_extensions/common.hpp

Lines changed: 0 additions & 9 deletions

@@ -15,15 +15,6 @@
     cutlassGetStatusString(error)); \
   }
 
-/**
- * Panic wrapper for unwinding CUDA runtime errors
- */
-#define CUDA_CHECK(status)                                       \
-  {                                                              \
-    cudaError_t error = status;                                  \
-    TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
-  }
-
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,

csrc/moe/torch_bindings.cpp

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 
   // Calculate the result of moe by summing up the partial results
   // from all selected experts.
-  m.def("moe_sum(Tensor! input, Tensor output) -> ()");
+  m.def("moe_sum(Tensor input, Tensor! output) -> ()");
   m.impl("moe_sum", torch::kCUDA, &moe_sum);
 
   // Aligning the number of tokens to be processed by each expert such
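
The schema fix moves the mutability annotation (`Tensor!`) from the input to the output argument, matching what the op actually does: it reads the per-expert partial results and writes the reduced sum into `output` in place. As a rough illustration only (the real op is a CUDA kernel, and the `[num_tokens, topk, hidden_size]` layout below is an assumption, not something stated in this diff), a pure-PyTorch stand-in would look like:

```python
import torch

def moe_sum_reference(input: torch.Tensor, output: torch.Tensor) -> None:
    """Rough PyTorch stand-in for the registered `moe_sum` op.

    Assumes `input` holds partial results of shape [num_tokens, topk, hidden_size]
    and `output` has shape [num_tokens, hidden_size].
    """
    # `input` is only read (plain `Tensor` in the schema); `output` is mutated
    # in place, which is what the `Tensor!` annotation now declares.
    torch.sum(input, dim=1, out=output)

# Hypothetical usage with made-up sizes:
partial = torch.randn(8, 2, 4096)   # [num_tokens, topk, hidden_size]
result = torch.empty(8, 4096)       # [num_tokens, hidden_size]
moe_sum_reference(partial, result)
```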

csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh

Lines changed: 5 additions & 3 deletions

@@ -8,6 +8,8 @@
 
 #include <ATen/cuda/CUDAContext.h>
 
+#include "cuda_utils.h"
+
 #include "cutlass/cutlass.h"
 
 #include "cutlass/gemm/device/gemm_universal_adapter.h"

@@ -95,9 +97,9 @@ struct cutlass_sparse_3x_gemm {
   // clang-format off
   using CollectiveMainloop =
       typename cutlass::gemm::collective::CollectiveBuilder<
-      cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
-      ElementAB, cutlass::layout::RowMajor, AlignmentAB,
-      ElementAB, cutlass::layout::ColumnMajor, AlignmentAB,
+          cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
+          ElementAB, cutlass::layout::RowMajor, AlignmentAB,
+          ElementAB, cutlass::layout::ColumnMajor, AlignmentAB,
           ElementAcc, TileShape, ClusterShape,
           Stages,
           KernelSchedule>::CollectiveOp;

docker/Dockerfile

Lines changed: 2 additions & 0 deletions

@@ -189,6 +189,8 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM
 
+SHELL ["/bin/bash", "-c"]
+
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
 

docker/Dockerfile.s390x

Lines changed: 29 additions & 3 deletions

@@ -84,16 +84,40 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
     rustup default stable && \
     rustup show
 
+FROM python-install AS torch
+ARG TORCH_VERSION=2.7.0
+ENV export _GLIBCXX_USE_CXX11_ABI=1
+ENV CARGO_HOME=/root/.cargo
+ENV RUSTUP_HOME=/root/.rustup
+ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
+
+WORKDIR /tmp
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
+    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
+    git clone https://github.com/pytorch/pytorch.git && \
+    cd pytorch && \
+    git checkout v2.7.0 && \
+    git submodule sync && \
+    git submodule update --init --recursive && \
+    uv pip install cmake ninja && \
+    uv pip install -r requirements.txt && \
+    python setup.py bdist_wheel
+
+
 FROM python-install AS torch-vision
 # Install torchvision
-ARG TORCH_VERSION=2.7.0.dev20250304
+ARG TORCH_VERSION=2.7.0
 ARG TORCH_VISION_VERSION=v0.20.1
 WORKDIR /tmp
 RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
    git clone https://github.com/pytorch/vision.git && \
    cd vision && \
    git checkout $TORCH_VISION_VERSION && \
-    uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
+    TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
+    uv pip install -v $TORCH_WHL_FILE && \
    python setup.py bdist_wheel
 
 FROM python-install AS hf-xet-builder

@@ -138,15 +162,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
     --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
     --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
+    --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
     sed -i '/^torch/d' requirements/build.txt && \
     ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
     VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
     HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
+    TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
     uv pip install -v \
         $ARROW_WHL_FILE \
         $VISION_WHL_FILE \
         $HF_XET_WHL_FILE \
-        --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
+        $TORCH_WHL_FILE \
         --index-strategy unsafe-best-match \
         -r requirements/build.txt \
         -r requirements/cpu.txt

docs/source/features/prompt_embeds.md

Lines changed: 2 additions & 102 deletions

@@ -20,59 +20,7 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.EmbedsPromp
 
 You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
 
-```python
-from vllm import LLM
-import transformers
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-llm = LLM(model=model_name, enable_prompt_embeds=True)
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Single prompt inference
-outputs = llm.generate({
-    "prompt_embeds": prompt_embeds,
-})
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-
-# Batch inference
-
-chats = [
-    [{"role": "user", "content": "Please tell me about the capital of France."}],
-    [{"role": "user", "content": "When is the day longest during the year?"}],
-    [{"role": "user", "content": "Where is bigger, the moon or the sun?"}]
-]
-
-token_ids_list = [
-    tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') for chat in chats
-]
-prompt_embeds_list = [embedding_layer(token_ids).squeeze(0) for token_ids in token_ids_list]
-
-outputs = llm.generate(
-    [
-        {
-            "prompt_embeds": prompt_embeds,
-        } for prompt_embeds in prompt_embeds_list
-    ]
-)
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-```
+<gh-file:examples/offline_inference/prompt_embed_inference.py>
 
 ## Online Serving
 

@@ -93,52 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \
 
 Then, you can use the OpenAI client as follows:
 
-```python
-from openai import OpenAI
-import transformers
-import torch
-
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Prompt embeddings
-buffer = io.BytesIO()
-torch.save(prompt_embeds, buffer)
-buffer.seek(0)
-binary_data = buffer.read()
-encoded_embeds = base64.b64encode(binary_data).decode('utf-8')
-
-
-completion = client_with_prompt_embeds.completions.create(
-    model=model_name,
-    # NOTE: The OpenAI client does not allow `None` as an input to
-    # `prompt`. Use an empty string if you have no text prompts.
-    prompt="",
-    max_tokens=5,
-    temperature=0.0,
-    # NOTE: The OpenAI client allows passing in extra JSON body via the
-    # `extra_body` argument.
-    extra_body={"prompt_embeds": encoded_embeds}
-)
-
-print(completion.choices[0].text)
-```
+<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
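
A side note on the removed online-serving snippet above: as written it would not run, since it uses `io` and `base64` without importing them and calls `client_with_prompt_embeds` where only `client` is defined. A corrected, condensed sketch of the same flow (assuming the server from the preceding section is already running at `http://localhost:8000/v1`) would be:

```python
import base64
import io

import torch
import transformers
from openai import OpenAI

model_name = "meta-llama/Llama-3.2-1B-Instruct"
client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

# Build prompt embeddings with Hugging Face Transformers.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt")
prompt_embeds = model.get_input_embeddings()(token_ids).squeeze(0)

# Serialize the tensor and base64-encode it for the JSON request body.
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

completion = client.completions.create(
    model=model_name,
    prompt="",  # the client rejects None; use an empty string
    max_tokens=5,
    temperature=0.0,
    extra_body={"prompt_embeds": encoded_embeds},
)
print(completion.choices[0].text)
```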

docs/source/getting_started/installation/gpu/xpu.inc.md

Lines changed: 0 additions & 1 deletion

@@ -66,7 +66,6 @@ XPU platform supports **tensor parallel** inference/serving and also supports **
 python -m vllm.entrypoints.openai.api_server \
     --model=facebook/opt-13b \
     --dtype=bfloat16 \
-    --device=xpu \
     --max_model_len=1024 \
     --distributed-executor-backend=ray \
     --pipeline-parallel-size=2 \
