
Commit e94c760

Merge pull request #550 from ROCm/upstream_merge_2025_05_19
Upstream merge 2025 05 19
2 parents 16d2b92 + e34fd18 commit e94c760


67 files changed: +2068 −615 lines changed


.buildkite/release-pipeline.yaml

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@ steps:
   agents:
     queue: cpu_queue_postmerge
   commands:
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
     - "mkdir artifacts"
     - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
     - "bash .buildkite/scripts/upload-wheels.sh"
@@ -31,7 +31,7 @@ steps:
   agents:
     queue: cpu_queue_postmerge
   commands:
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
     - "mkdir artifacts"
     - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
     - "bash .buildkite/scripts/upload-wheels.sh"

.github/mergify.yml

Lines changed: 11 additions & 0 deletions

@@ -163,6 +163,17 @@ pull_request_rules:

       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork

+- name: assign reviewer for tensorizer changes
+  conditions:
+    - files~=^vllm/model_executor/model_loader/tensorizer.py
+    - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
+    - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
+    - files~=^tests/tensorizer_loader/
+  actions:
+    assign:
+      users:
+        - "sangstar"
+
 - name: remove 'needs-rebase' label when conflict is resolved
   conditions:
     - -conflict
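The new rule assigns `sangstar` as reviewer whenever any changed file matches one of the `files~=` regular expressions. A minimal Python sketch of that matching logic (illustrative only; the real evaluation happens inside Mergify, and the helper name below is made up):

```python
import re

# Patterns copied from the rule above; Mergify's `files~=` applies a regex
# to each changed file path.
TENSORIZER_PATTERNS = [
    r"^vllm/model_executor/model_loader/tensorizer.py",
    r"^vllm/model_executor/model_loader/tensorizer_loader.py",
    r"^tests/entrypoints/openai/test_tensorizer_entrypoint.py",
    r"^tests/tensorizer_loader/",
]

def reviewers_for(changed_files):
    """Return the reviewers this rule would assign for a list of changed paths."""
    if any(re.search(p, path) for p in TENSORIZER_PATTERNS for path in changed_files):
        return ["sangstar"]
    return []

print(reviewers_for(["tests/tensorizer_loader/test_serialize.py"]))  # ['sangstar']
```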

csrc/moe/marlin_moe_wna16/marlin_template.h

Lines changed: 14 additions & 11 deletions

@@ -1767,17 +1767,20 @@ __global__ void Marlin(

     if constexpr (has_act_order) {
       slice_k_start += tb_k * stages;
-      slice_k_start_shared_fetch += tb_k * stages;
-      int first_group_id = g_idx[slice_k_start];
-      int last_g_idx = slice_k_start + stages * tb_k * 2;
-      if (last_g_idx >= prob_k) {
-        last_g_idx = prob_k - 1;
-      }
-      int last_group_id = g_idx[last_g_idx];
-      if (last_group_id >= sh_first_group_id + sh_num_groups) {
-        fetch_act_order_scales_to_shared(false, first_group_id,
-                                         last_group_id);
-        __syncthreads();
+
+      if (slice_k_start < prob_k) {
+        slice_k_start_shared_fetch += tb_k * stages;
+        int first_group_id = g_idx[slice_k_start];
+        int last_g_idx = slice_k_start + stages * tb_k * 2;
+        if (last_g_idx >= prob_k) {
+          last_g_idx = prob_k - 1;
+        }
+        int last_group_id = g_idx[last_g_idx];
+        if (last_group_id >= sh_first_group_id + sh_num_groups) {
+          fetch_act_order_scales_to_shared(false, first_group_id,
+                                           last_group_id);
+          __syncthreads();
+        }
       }
     }
     if (slice_iters == 0) {

csrc/quantization/gptq_marlin/marlin_template.h

Lines changed: 14 additions & 10 deletions

@@ -1588,16 +1588,20 @@ __global__ void Marlin(

     if constexpr (has_act_order) {
       slice_k_start += tb_k * stages;
-      slice_k_start_shared_fetch += tb_k * stages;
-      int first_group_id = g_idx[slice_k_start];
-      int last_g_idx = slice_k_start + stages * tb_k * 2;
-      if (last_g_idx >= prob_k) {
-        last_g_idx = prob_k - 1;
-      }
-      int last_group_id = g_idx[last_g_idx];
-      if (last_group_id >= sh_first_group_id + sh_num_groups) {
-        fetch_act_order_scales_to_shared(false, first_group_id, last_group_id);
-        __syncthreads();
+
+      if (slice_k_start < prob_k) {
+        slice_k_start_shared_fetch += tb_k * stages;
+        int first_group_id = g_idx[slice_k_start];
+        int last_g_idx = slice_k_start + stages * tb_k * 2;
+        if (last_g_idx >= prob_k) {
+          last_g_idx = prob_k - 1;
+        }
+        int last_group_id = g_idx[last_g_idx];
+        if (last_group_id >= sh_first_group_id + sh_num_groups) {
+          fetch_act_order_scales_to_shared(false, first_group_id,
+                                           last_group_id);
+          __syncthreads();
+        }
       }
     }
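In both Marlin kernels above, the act-order scale prefetch is now wrapped in a `slice_k_start < prob_k` check, so `g_idx[slice_k_start]` is never read once the slice offset has advanced past the end of the problem. A schematic Python sketch of the guarded lookup (plain lists stand in for the device arrays; this illustrates the bounds check only and is not kernel code):

```python
def prefetch_groups(g_idx, prob_k, slice_k_start, tb_k, stages):
    """Illustrative model of the guarded group-id lookup added above."""
    slice_k_start += tb_k * stages
    if slice_k_start >= prob_k:
        # Without the new guard, g_idx[slice_k_start] would read out of bounds here.
        return None
    first_group_id = g_idx[slice_k_start]
    last_g_idx = min(slice_k_start + stages * tb_k * 2, prob_k - 1)
    last_group_id = g_idx[last_g_idx]
    return first_group_id, last_group_id

g_idx = [k // 128 for k in range(4096)]  # hypothetical group-index table
print(prefetch_groups(g_idx, prob_k=4096, slice_k_start=3584, tb_k=64, stages=4))  # (30, 31)
print(prefetch_groups(g_idx, prob_k=4096, slice_k_start=3840, tb_k=64, stages=4))  # None (guard trips)
```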

docker/Dockerfile

Lines changed: 9 additions & 3 deletions

@@ -77,7 +77,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
 # see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 # Override the arch list for flash-attn to reduce the binary size
 ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@@ -255,9 +255,15 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/uv \
     . /etc/environment && \
     if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
+        # uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.4/flashinfer_python-0.2.4+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
         # TESTING: install FlashInfer from source to test 2.7.0 final RC
-        FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \
-            uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \
+        if [[ "$CUDA_VERSION" == 12.8* ]]; then \
+            export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'; \
+        else \
+            export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
+        fi && \
+        export FLASHINFER_ENABLE_AOT=1; \
+        uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
     fi
 COPY examples examples
 COPY benchmarks benchmarks
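The FlashInfer step above now derives `TORCH_CUDA_ARCH_LIST` from the CUDA version before building from source. A small sketch of the same selection logic (the version prefix and arch lists are taken from the hunk; the helper name is made up for illustration):

```python
def flashinfer_arch_list(cuda_version: str) -> str:
    """Mirror the shell branch in the Dockerfile hunk above."""
    if cuda_version.startswith("12.8"):
        # The 12.8 branch additionally targets compute capability 10.0.
        return "7.5 8.0 8.9 9.0 10.0+PTX"
    return "7.5 8.0 8.9 9.0+PTX"

assert flashinfer_arch_list("12.8.1") == "7.5 8.0 8.9 9.0 10.0+PTX"
assert flashinfer_arch_list("12.6.3") == "7.5 8.0 8.9 9.0+PTX"
```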

docker/Dockerfile.ppc64le

Lines changed: 2 additions & 6 deletions

@@ -21,12 +21,8 @@ ENV UV_LINK_MODE=copy
 # Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
 # when `--jobs=<N>` is passed with podman build command
 RUN microdnf install -y openssl-devel dnf \
-    && dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
-        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
-        https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
-    && dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os \
-    && dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/AppStream/`arch`/os \
-    && dnf config-manager --set-enabled crb \
+    && dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
+    && dnf config-manager --set-enabled codeready-builder-for-rhel-9-ppc64le-rpms \
     && dnf install -y \
        git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
        pkgconfig xsimd zeromq-devel kmod findutils protobuf* \

docs/source/deployment/frameworks/index.md

Lines changed: 1 addition & 0 deletions

@@ -10,6 +10,7 @@ chatbox
 dify
 dstack
 helm
+litellm
 lobe-chat
 lws
 modal
docs/source/deployment/frameworks/litellm.md

Lines changed: 75 additions & 0 deletions

(deployment-litellm)=

# LiteLLM

[LiteLLM](https://github.com/BerriAI/litellm) lets you call all LLM APIs using the OpenAI format (Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq, etc.).

LiteLLM manages:

- Translating inputs to the provider's `completion`, `embedding`, and `image_generation` endpoints
- [Consistent output](https://docs.litellm.ai/docs/completion/output): text responses are always available at `['choices'][0]['message']['content']`
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) via the [Router](https://docs.litellm.ai/docs/routing)
- Budgets and rate limits per project, API key, and model via the [LiteLLM Proxy Server (LLM Gateway)](https://docs.litellm.ai/docs/simple_proxy)

LiteLLM supports all models served by vLLM.

## Prerequisites

- Set up the vLLM and LiteLLM environment:

```console
pip install vllm litellm
```

## Deploy

### Chat completion

- Start the vLLM server with a supported chat completion model, e.g.

```console
vllm serve qwen/Qwen1.5-0.5B-Chat
```

- Call it with LiteLLM:

```python
import litellm

messages = [{"content": "Hello, how are you?", "role": "user"}]

# The "hosted_vllm" prefix is required
response = litellm.completion(
    model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat",  # pass the vLLM model name
    messages=messages,
    api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
    temperature=0.2,
    max_tokens=80)

print(response)
```

### Embeddings

- Start the vLLM server with a supported embedding model, e.g.

```console
vllm serve BAAI/bge-base-en-v1.5
```

- Call it with LiteLLM:

```python
import os

from litellm import embedding

os.environ["HOSTED_VLLM_API_BASE"] = "http://{your-vllm-server-host}:{your-vllm-server-port}/v1"

# The "hosted_vllm" prefix is required; pass the vLLM model name
embedding = embedding(model="hosted_vllm/BAAI/bge-base-en-v1.5", input=["Hello world"])

print(embedding)
```

For details, see the tutorial [Using vLLM in LiteLLM](https://docs.litellm.ai/docs/providers/vllm).

docs/source/features/prompt_embeds.md

Lines changed: 144 additions & 0 deletions

# Prompt Embedding Inputs

This page teaches you how to pass prompt embedding inputs to vLLM.

## What are prompt embeddings?

The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer), then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary.

:::{note}
Prompt embeddings are currently only supported in the v0 engine.
:::

## Offline Inference

To pass prompt embeddings directly, follow this schema for {class}`vllm.inputs.EmbedsPrompt`:

- `prompt_embeds`: A torch tensor representing a sequence of prompt/token embeddings. It has shape (sequence_length, hidden_size), where sequence_length is the number of token embeddings and hidden_size is the hidden (embedding) size of the model.

### Hugging Face Transformers Inputs

You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:

```python
from vllm import LLM
import transformers

model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

llm = LLM(model=model_name, enable_prompt_embeds=True)

# Refer to the HuggingFace repo for the correct format to use
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')

embedding_layer = transformers_model.get_input_embeddings()
prompt_embeds = embedding_layer(token_ids).squeeze(0)

# Single prompt inference
outputs = llm.generate({
    "prompt_embeds": prompt_embeds,
})

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)

# Batch inference

chats = [
    [{"role": "user", "content": "Please tell me about the capital of France."}],
    [{"role": "user", "content": "When is the day longest during the year?"}],
    [{"role": "user", "content": "Where is bigger, the moon or the sun?"}]
]

token_ids_list = [
    tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') for chat in chats
]
prompt_embeds_list = [embedding_layer(token_ids).squeeze(0) for token_ids in token_ids_list]

outputs = llm.generate(
    [
        {
            "prompt_embeds": prompt_embeds,
        } for prompt_embeds in prompt_embeds_list
    ]
)

for o in outputs:
    generated_text = o.outputs[0].text
    print(generated_text)
```

## Online Serving

Our OpenAI-compatible server accepts prompt embedding inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embedding inputs are added via a new `'prompt_embeds'` key in the JSON package.

When a mixture of `'prompt_embeds'` and `'prompt'` inputs is provided in a single request, the prompt embeds are always returned first.

Prompt embeddings are passed in as base64 encoded torch tensors.

### Transformers Inputs via OpenAI Client

First, launch the OpenAI-compatible server:

```bash
vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \
  --max-model-len 4096 --enable-prompt-embeds
```

Then, you can use the OpenAI client as follows:

```python
import base64
import io

from openai import OpenAI
import transformers
import torch

openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

# Refer to the HuggingFace repo for the correct format to use
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')

embedding_layer = transformers_model.get_input_embeddings()
prompt_embeds = embedding_layer(token_ids).squeeze(0)

# Encode the prompt embeddings as a base64 string
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
buffer.seek(0)
binary_data = buffer.read()
encoded_embeds = base64.b64encode(binary_data).decode('utf-8')

completion = client.completions.create(
    model=model_name,
    # NOTE: The OpenAI client does not allow `None` as an input to
    # `prompt`. Use an empty string if you have no text prompts.
    prompt="",
    max_tokens=5,
    temperature=0.0,
    # NOTE: The OpenAI client allows passing in extra JSON body via the
    # `extra_body` argument.
    extra_body={"prompt_embeds": encoded_embeds}
)

print(completion.choices[0].text)
```
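As a sanity check, the base64 encoding used above can be round-tripped locally before sending the request. A minimal sketch (the random tensor is only a placeholder for real prompt embeddings; the matching decode on the server side is internal to vLLM):

```python
import base64
import io

import torch

prompt_embeds = torch.randn(8, 2048)  # placeholder standing in for real embeddings

# Encode exactly as in the example above
buffer = io.BytesIO()
torch.save(prompt_embeds, buffer)
encoded_embeds = base64.b64encode(buffer.getvalue()).decode("utf-8")

# Decode and verify the tensor survives the round trip
decoded = torch.load(io.BytesIO(base64.b64decode(encoded_embeds)))
assert torch.equal(prompt_embeds, decoded)
```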
