
Commit e94c760

Merge pull request #550 from ROCm/upstream_merge_2025_05_19
Upstream merge 2025 05 19
2 parents 16d2b92 + e34fd18 commit e94c760


67 files changed: +2068 −615 lines changed


.buildkite/release-pipeline.yaml

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@ steps:
   agents:
     queue: cpu_queue_postmerge
   commands:
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
     - "mkdir artifacts"
     - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
     - "bash .buildkite/scripts/upload-wheels.sh"
@@ -31,7 +31,7 @@ steps:
   agents:
     queue: cpu_queue_postmerge
   commands:
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
     - "mkdir artifacts"
     - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
     - "bash .buildkite/scripts/upload-wheels.sh"

.github/mergify.yml

Lines changed: 11 additions & 0 deletions

@@ -163,6 +163,17 @@ pull_request_rules:

       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork

+- name: assign reviewer for tensorizer changes
+  conditions:
+    - files~=^vllm/model_executor/model_loader/tensorizer.py
+    - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
+    - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
+    - files~=^tests/tensorizer_loader/
+  actions:
+    assign:
+      users:
+        - "sangstar"
+
 - name: remove 'needs-rebase' label when conflict is resolved
   conditions:
     - -conflict
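The new rule assigns `sangstar` as reviewer whenever any changed file matches one of the `files~=` regular expressions. A minimal Python sketch of that matching logic (illustrative only; the real evaluation happens inside Mergify, and the helper name below is made up):

```python
import re

# Patterns copied from the rule above; Mergify's `files~=` applies a regex
# to each changed file path.
TENSORIZER_PATTERNS = [
    r"^vllm/model_executor/model_loader/tensorizer.py",
    r"^vllm/model_executor/model_loader/tensorizer_loader.py",
    r"^tests/entrypoints/openai/test_tensorizer_entrypoint.py",
    r"^tests/tensorizer_loader/",
]

def reviewers_for(changed_files):
    """Return the reviewers this rule would assign for a list of changed paths."""
    if any(re.search(p, path) for p in TENSORIZER_PATTERNS for path in changed_files):
        return ["sangstar"]
    return []

print(reviewers_for(["tests/tensorizer_loader/test_serialize.py"]))  # ['sangstar']
```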

csrc/moe/marlin_moe_wna16/marlin_template.h

Lines changed: 14 additions & 11 deletions

@@ -1767,17 +1767,20 @@ __global__ void Marlin(

     if constexpr (has_act_order) {
       slice_k_start += tb_k * stages;
-      slice_k_start_shared_fetch += tb_k * stages;
-      int first_group_id = g_idx[slice_k_start];
-      int last_g_idx = slice_k_start + stages * tb_k * 2;
-      if (last_g_idx >= prob_k) {
-        last_g_idx = prob_k - 1;
-      }
-      int last_group_id = g_idx[last_g_idx];
-      if (last_group_id >= sh_first_group_id + sh_num_groups) {
-        fetch_act_order_scales_to_shared(false, first_group_id,
-                                         last_group_id);
-        __syncthreads();
+
+      if (slice_k_start < prob_k) {
+        slice_k_start_shared_fetch += tb_k * stages;
+        int first_group_id = g_idx[slice_k_start];
+        int last_g_idx = slice_k_start + stages * tb_k * 2;
+        if (last_g_idx >= prob_k) {
+          last_g_idx = prob_k - 1;
+        }
+        int last_group_id = g_idx[last_g_idx];
+        if (last_group_id >= sh_first_group_id + sh_num_groups) {
+          fetch_act_order_scales_to_shared(false, first_group_id,
+                                           last_group_id);
+          __syncthreads();
+        }
       }
     }
     if (slice_iters == 0) {

csrc/quantization/gptq_marlin/marlin_template.h

Lines changed: 14 additions & 10 deletions

@@ -1588,16 +1588,20 @@ __global__ void Marlin(

     if constexpr (has_act_order) {
       slice_k_start += tb_k * stages;
-      slice_k_start_shared_fetch += tb_k * stages;
-      int first_group_id = g_idx[slice_k_start];
-      int last_g_idx = slice_k_start + stages * tb_k * 2;
-      if (last_g_idx >= prob_k) {
-        last_g_idx = prob_k - 1;
-      }
-      int last_group_id = g_idx[last_g_idx];
-      if (last_group_id >= sh_first_group_id + sh_num_groups) {
-        fetch_act_order_scales_to_shared(false, first_group_id, last_group_id);
-        __syncthreads();
+
+      if (slice_k_start < prob_k) {
+        slice_k_start_shared_fetch += tb_k * stages;
+        int first_group_id = g_idx[slice_k_start];
+        int last_g_idx = slice_k_start + stages * tb_k * 2;
+        if (last_g_idx >= prob_k) {
+          last_g_idx = prob_k - 1;
+        }
+        int last_group_id = g_idx[last_g_idx];
+        if (last_group_id >= sh_first_group_id + sh_num_groups) {
+          fetch_act_order_scales_to_shared(false, first_group_id,
+                                           last_group_id);
+          __syncthreads();
+        }
       }
     }
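In both Marlin kernels above, the act-order scale prefetch is now wrapped in a `slice_k_start < prob_k` check, so `g_idx[slice_k_start]` is never read once the slice offset has advanced past the end of the problem. A schematic Python sketch of the guarded lookup (plain lists stand in for the device arrays; this illustrates the bounds check only and is not kernel code):

```python
def prefetch_groups(g_idx, prob_k, slice_k_start, tb_k, stages):
    """Illustrative model of the guarded group-id lookup added above."""
    slice_k_start += tb_k * stages
    if slice_k_start >= prob_k:
        # Without the new guard, g_idx[slice_k_start] would read out of bounds here.
        return None
    first_group_id = g_idx[slice_k_start]
    last_g_idx = min(slice_k_start + stages * tb_k * 2, prob_k - 1)
    last_group_id = g_idx[last_g_idx]
    return first_group_id, last_group_id

g_idx = [k // 128 for k in range(4096)]  # hypothetical group-index table
print(prefetch_groups(g_idx, prob_k=4096, slice_k_start=3584, tb_k=64, stages=4))  # (30, 31)
print(prefetch_groups(g_idx, prob_k=4096, slice_k_start=3840, tb_k=64, stages=4))  # None (guard trips)
```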

docker/Dockerfile

Lines changed: 9 additions & 3 deletions

@@ -77,7 +77,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
 # see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # Override the arch list for flash-attn to reduce the binary size
 ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@@ -255,9 +255,15 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/uv \
     . /etc/environment && \
     if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
+        # uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.4/flashinfer_python-0.2.4+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
         # TESTING: install FlashInfer from source to test 2.7.0 final RC
-        FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \
-            uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \
+        if [[ "$CUDA_VERSION" == 12.8* ]]; then \
+            export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'; \
+        else \
+            export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
+        fi && \
+        export FLASHINFER_ENABLE_AOT=1; \
+        uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
     fi
 COPY examples examples
 COPY benchmarks benchmarks
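The FlashInfer step above now derives `TORCH_CUDA_ARCH_LIST` from the CUDA version before building from source. A small sketch of the same selection logic (the version prefix and arch lists are taken from the hunk; the helper name is made up for illustration):

```python
def flashinfer_arch_list(cuda_version: str) -> str:
    """Mirror the shell branch in the Dockerfile hunk above."""
    if cuda_version.startswith("12.8"):
        # The 12.8 branch additionally targets compute capability 10.0.
        return "7.5 8.0 8.9 9.0 10.0+PTX"
    return "7.5 8.0 8.9 9.0+PTX"

assert flashinfer_arch_list("12.8.1") == "7.5 8.0 8.9 9.0 10.0+PTX"
assert flashinfer_arch_list("12.6.3") == "7.5 8.0 8.9 9.0+PTX"
```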

docker/Dockerfile.ppc64le

Lines changed: 2 additions & 6 deletions

@@ -21,12 +21,8 @@ ENV UV_LINK_MODE=copy
 # Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
 # when `--jobs=<N>` is passed with podman build command
 RUN microdnf install -y openssl-devel dnf \
-    && dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
-        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
-        https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
-    && dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os \
-    && dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/AppStream/`arch`/os \
-    && dnf config-manager --set-enabled crb \
+    && dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
+    && dnf config-manager --set-enabled codeready-builder-for-rhel-9-ppc64le-rpms \
     && dnf install -y \
        git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
        pkgconfig xsimd zeromq-devel kmod findutils protobuf* \

docs/source/deployment/frameworks/index.md

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@ chatbox
 dify
 dstack
 helm
+litellm
 lobe-chat
 lws
 modal
docs/source/deployment/frameworks/litellm.md

Lines changed: 75 additions & 0 deletions

(deployment-litellm)=

# LiteLLM

[LiteLLM](https://github.com/BerriAI/litellm) lets you call all LLM APIs using the OpenAI format (Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq, etc.).

LiteLLM manages:

- Translating inputs to the provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output): text responses are always available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) via the [Router](https://docs.litellm.ai/docs/routing)
- Budgets and rate limits per project, API key, and model via the [LiteLLM Proxy Server (LLM Gateway)](https://docs.litellm.ai/docs/simple_proxy)

LiteLLM supports all models served by vLLM.

## Prerequisites

- Set up the vLLM and LiteLLM environment:

```console
pip install vllm litellm
```

## Deploy

### Chat completion

- Start the vLLM server with a supported chat completion model, e.g.

```console
vllm serve qwen/Qwen1.5-0.5B-Chat
```

- Call it with LiteLLM:

```python
import litellm

messages = [{"content": "Hello, how are you?", "role": "user"}]

# The "hosted_vllm" prefix is required
response = litellm.completion(
    model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat",  # pass the vLLM model name
    messages=messages,
    api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
    temperature=0.2,
    max_tokens=80)

print(response)
```

### Embeddings

- Start the vLLM server with a supported embedding model, e.g.

```console
vllm serve BAAI/bge-base-en-v1.5
```

- Call it with LiteLLM:

```python
import os

from litellm import embedding

os.environ["HOSTED_VLLM_API_BASE"] = "http://{your-vllm-server-host}:{your-vllm-server-port}/v1"

# The "hosted_vllm" prefix is required; pass the vLLM model name
embedding = embedding(model="hosted_vllm/BAAI/bge-base-en-v1.5", input=["Hello world"])

print(embedding)
```

For details, see the tutorial [Using vLLM in LiteLLM](https://docs.litellm.ai/docs/providers/vllm).

docs/source/features/prompt_embeds.md

Lines changed: 144 additions & 0 deletions

# Prompt Embedding Inputs

This page teaches you how to pass prompt embedding inputs to vLLM.

## What are prompt embeddings?

The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer), then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary.

:::{note}
Prompt embeddings are currently only supported in the v0 engine.
:::

## Offline Inference

To pass prompt embeddings directly, follow this schema for {class}`vllm.inputs.EmbedsPrompt`:

- `prompt_embeds`: A torch tensor representing a sequence of prompt/token embeddings. It has shape (sequence_length, hidden_size), where sequence_length is the number of token embeddings and hidden_size is the hidden (embedding) size of the model.

### Hugging Face Transformers Inputs

You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:

```python
from vllm import LLM
import transformers

model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

llm = LLM(model=model_name, enable_prompt_embeds=True)

# Refer to the HuggingFace repo for the correct format to use
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')

embedding_layer = transformers_model.get_input_embeddings()
prompt_embeds = embedding_layer(token_ids).squeeze(0)

# Single prompt inference
outputs = llm.generate({
    "prompt_embeds": prompt_embeds,
})

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)

# Batch inference

chats = [
    [{"role": "user", "content": "Please tell me about the capital of France."}],
    [{"role": "user", "content": "When is the day longest during the year?"}],
    [{"role": "user", "content": "Where is bigger, the moon or the sun?"}]
]

token_ids_list = [
    tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') for chat in chats
]
prompt_embeds_list = [embedding_layer(token_ids).squeeze(0) for token_ids in token_ids_list]

outputs = llm.generate(
    [
        {
            "prompt_embeds": prompt_embeds,
        } for prompt_embeds in prompt_embeds_list
    ]
)

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

## Online Serving

Our OpenAI-compatible server accepts prompt embedding inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embedding inputs are added via a new `'prompt_embeds'` key in the JSON package.

When a mixture of `'prompt_embeds'` and `'prompt'` inputs is provided in a single request, the prompt embeds are always returned first.

Prompt embeddings are passed in as base64 encoded torch tensors.

### Transformers Inputs via OpenAI Client

First, launch the OpenAI-compatible server:

```bash
vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \
  --max-model-len 4096 --enable-prompt-embeds
```

Then, you can use the OpenAI client as follows:

```python
import base64
import io

from openai import OpenAI
import transformers
import torch

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

# Refer to the HuggingFace repo for the correct format to use
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')

embedding_layer = transformers_model.get_input_embeddings()
prompt_embeds = embedding_layer(token_ids).squeeze(0)

# Encode the prompt embeddings as a base64 string
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
binary_data = buffer.read()
encoded_embeds = base64.b64encode(binary_data).decode('utf-8')

completion = client.completions.create(
    model=model_name,
    # NOTE: The OpenAI client does not allow `None` as an input to
    # `prompt`. Use an empty string if you have no text prompts.
    prompt="",
    max_tokens=5,
    temperature=0.0,
    # NOTE: The OpenAI client allows passing in extra JSON body via the
    # `extra_body` argument.
    extra_body={"prompt_embeds": encoded_embeds}
)

print(completion.choices[0].text)
```
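As a sanity check, the base64 encoding used above can be round-tripped locally before sending the request. A minimal sketch (the random tensor is only a placeholder for real prompt embeddings; the matching decode on the server side is internal to vLLM):

```python
import base64
import io

import torch

prompt_embeds = torch.randn(8, 2048)  # placeholder standing in for real embeddings

# Encode exactly as in the example above
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

# Decode and verify the tensor survives the round trip
decoded = torch.load(io.BytesIO(base64.b64decode(encoded_embeds)))
assert torch.equal(prompt_embeds, decoded)
```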
