
Commit 16af49c

Merge remote-tracking branch 'upstream/main'

2 parents: a31e5d8 + 2b16104, commit: 16af49c

73 files changed: +1768 / -500 lines


.buildkite/release-pipeline.yaml

Lines changed: 1 addition & 1 deletion

@@ -64,7 +64,7 @@ steps:
       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
     plugins:
       - docker-login#v3.0.0:
-          username: vllm
+          username: vllmbot
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 3 additions & 1 deletion

@@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
+HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
 
 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
-aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
 
 # prune old image and containers to save disk space, and only once a day
 # by using a timestamp file in tmp.

@@ -47,6 +48,7 @@ trap remove_docker_container EXIT
 docker run --rm -it --device=/dev/neuron0 --network bridge \
     -v "${HF_CACHE}:${HF_MOUNT}" \
     -e "HF_HOME=${HF_MOUNT}" \
+    -e "HF_TOKEN=${HF_TOKEN}" \
     -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
     -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
    --name "${container_name}" \

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 0 deletions

@@ -141,6 +141,7 @@ steps:
   - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
+  - tests/distributed/test_events
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py

@@ -159,6 +160,7 @@ steps:
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests

csrc/cutlass_extensions/common.hpp

Lines changed: 0 additions & 9 deletions

@@ -15,15 +15,6 @@
     cutlassGetStatusString(error)); \
   }
 
-/**
- * Panic wrapper for unwinding CUDA runtime errors
- */
-#define CUDA_CHECK(status)                                       \
-  {                                                              \
-    cudaError_t error = status;                                  \
-    TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
-  }
-
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,

csrc/moe/torch_bindings.cpp

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 
   // Calculate the result of moe by summing up the partial results
   // from all selected experts.
-  m.def("moe_sum(Tensor! input, Tensor output) -> ()");
+  m.def("moe_sum(Tensor input, Tensor! output) -> ()");
   m.impl("moe_sum", torch::kCUDA, &moe_sum);
 
   // Aligning the number of tokens to be processed by each expert such
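
The schema fix moves the mutability annotation (`Tensor!`) from the input to the output argument, matching what the op actually does: it reads the per-expert partial results and writes the reduced sum into `output` in place. As a rough illustration only (the real op is a CUDA kernel, and the `[num_tokens, topk, hidden_size]` layout below is an assumption, not something stated in this diff), a pure-PyTorch stand-in would look like:

```python
import torch

def moe_sum_reference(input: torch.Tensor, output: torch.Tensor) -> None:
    """Rough PyTorch stand-in for the registered `moe_sum` op.

    Assumes `input` holds partial results of shape [num_tokens, topk, hidden_size]
    and `output` has shape [num_tokens, hidden_size].
    """
    # `input` is only read (plain `Tensor` in the schema); `output` is mutated
    # in place, which is what the `Tensor!` annotation now declares.
    torch.sum(input, dim=1, out=output)

# Hypothetical usage with made-up sizes:
partial = torch.randn(8, 2, 4096)   # [num_tokens, topk, hidden_size]
result = torch.empty(8, 4096)       # [num_tokens, hidden_size]
moe_sum_reference(partial, result)
```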

csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh

Lines changed: 5 additions & 3 deletions

@@ -8,6 +8,8 @@
 
 #include <ATen/cuda/CUDAContext.h>
 
+#include "cuda_utils.h"
+
 #include "cutlass/cutlass.h"
 
 #include "cutlass/gemm/device/gemm_universal_adapter.h"

@@ -95,9 +97,9 @@ struct cutlass_sparse_3x_gemm {
   // clang-format off
   using CollectiveMainloop =
       typename cutlass::gemm::collective::CollectiveBuilder<
-      cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
-      ElementAB, cutlass::layout::RowMajor, AlignmentAB,
-      ElementAB, cutlass::layout::ColumnMajor, AlignmentAB,
+          cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
+          ElementAB, cutlass::layout::RowMajor, AlignmentAB,
+          ElementAB, cutlass::layout::ColumnMajor, AlignmentAB,
           ElementAcc, TileShape, ClusterShape,
           Stages,
           KernelSchedule>::CollectiveOp;

docker/Dockerfile

Lines changed: 2 additions & 0 deletions

@@ -189,6 +189,8 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM
 
+SHELL ["/bin/bash", "-c"]
+
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
 

docker/Dockerfile.s390x

Lines changed: 29 additions & 3 deletions

@@ -84,16 +84,40 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
     rustup default stable && \
     rustup show
 
+FROM python-install AS torch
+ARG TORCH_VERSION=2.7.0
+ENV export _GLIBCXX_USE_CXX11_ABI=1
+ENV CARGO_HOME=/root/.cargo
+ENV RUSTUP_HOME=/root/.rustup
+ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
+
+WORKDIR /tmp
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
+    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
+    git clone https://github.com/pytorch/pytorch.git && \
+    cd pytorch && \
+    git checkout v2.7.0 && \
+    git submodule sync && \
+    git submodule update --init --recursive && \
+    uv pip install cmake ninja && \
+    uv pip install -r requirements.txt && \
+    python setup.py bdist_wheel
+
+
 FROM python-install AS torch-vision
 # Install torchvision
-ARG TORCH_VERSION=2.7.0.dev20250304
+ARG TORCH_VERSION=2.7.0
 ARG TORCH_VISION_VERSION=v0.20.1
 WORKDIR /tmp
 RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
    git clone https://github.com/pytorch/vision.git && \
    cd vision && \
    git checkout $TORCH_VISION_VERSION && \
-    uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
+    TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
+    uv pip install -v $TORCH_WHL_FILE && \
    python setup.py bdist_wheel
 
 FROM python-install AS hf-xet-builder

@@ -138,15 +162,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
     --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
     --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
+    --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
     sed -i '/^torch/d' requirements/build.txt && \
     ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
     VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
     HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
+    TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
     uv pip install -v \
         $ARROW_WHL_FILE \
         $VISION_WHL_FILE \
         $HF_XET_WHL_FILE \
-        --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
+        $TORCH_WHL_FILE \
         --index-strategy unsafe-best-match \
         -r requirements/build.txt \
         -r requirements/cpu.txt

docs/source/features/prompt_embeds.md

Lines changed: 2 additions & 102 deletions

@@ -20,59 +20,7 @@ To input multi-modal data, follow this schema in {class}`vllm.inputs.EmbedsPromp
 
 You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
 
-```python
-from vllm import LLM
-import transformers
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-llm = LLM(model=model_name, enable_prompt_embeds=True)
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Single prompt inference
-outputs = llm.generate({
-    "prompt_embeds": prompt_embeds,
-})
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-
-# Batch inference
-
-chats = [
-    [{"role": "user", "content": "Please tell me about the capital of France."}],
-    [{"role": "user", "content": "When is the day longest during the year?"}],
-    [{"role": "user", "content": "Where is bigger, the moon or the sun?"}]
-]
-
-token_ids_list = [
-    tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') for chat in chats
-]
-prompt_embeds_list = [embedding_layer(token_ids).squeeze(0) for token_ids in token_ids_list]
-
-outputs = llm.generate(
-    [
-        {
-            "prompt_embeds": prompt_embeds,
-        } for prompt_embeds in prompt_embeds_list
-    ]
-)
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-```
+<gh-file:examples/offline_inference/prompt_embed_inference.py>
 
 ## Online Serving
 

@@ -93,52 +41,4 @@ vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \
 
 Then, you can use the OpenAI client as follows:
 
-```python
-from openai import OpenAI
-import transformers
-import torch
-
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-client = OpenAI(
-    api_key=openai_api_key,
-    base_url=openai_api_base,
-)
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Prompt embeddings
-buffer = io.BytesIO()
-torch.save(prompt_embeds, buffer)
-buffer.seek(0)
-binary_data = buffer.read()
-encoded_embeds = base64.b64encode(binary_data).decode('utf-8')
-
-
-completion = client_with_prompt_embeds.completions.create(
-    model=model_name,
-    # NOTE: The OpenAI client does not allow `None` as an input to
-    # `prompt`. Use an empty string if you have no text prompts.
-    prompt="",
-    max_tokens=5,
-    temperature=0.0,
-    # NOTE: The OpenAI client allows passing in extra JSON body via the
-    # `extra_body` argument.
-    extra_body={"prompt_embeds": encoded_embeds}
-)
-
-print(completion.choices[0].text)
-```
+<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
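
A side note on the removed online-serving snippet above: as written it would not run, since it uses `io` and `base64` without importing them and calls `client_with_prompt_embeds` where only `client` is defined. A corrected, condensed sketch of the same flow (assuming the server from the preceding section is already running at `http://localhost:8000/v1`) would be:

```python
import base64
import io

import torch
import transformers
from openai import OpenAI

model_name = "meta-llama/Llama-3.2-1B-Instruct"
client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

# Build prompt embeddings with Hugging Face Transformers.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt")
prompt_embeds = model.get_input_embeddings()(token_ids).squeeze(0)

# Serialize the tensor and base64-encode it for the JSON request body.
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

completion = client.completions.create(
    model=model_name,
    prompt="",  # the client rejects None; use an empty string
    max_tokens=5,
    temperature=0.0,
    extra_body={"prompt_embeds": encoded_embeds},
)
print(completion.choices[0].text)
```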

docs/source/getting_started/installation/gpu/xpu.inc.md

Lines changed: 0 additions & 1 deletion

@@ -66,7 +66,6 @@ XPU platform supports **tensor parallel** inference/serving and also supports **
 python -m vllm.entrypoints.openai.api_server \
     --model=facebook/opt-13b \
     --dtype=bfloat16 \
-    --device=xpu \
     --max_model_len=1024 \
     --distributed-executor-backend=ray \
     --pipeline-parallel-size=2 \
