Commit 39043e0

Merge branch 'main' into eagle-fusion-sync-reduce
2 parents e627f0a + c635c5f

165 files changed, +7708 -4619 lines changed


.buildkite/nightly-benchmarks/nightly-annotation.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -16,7 +16,7 @@ Please download the visualization scripts in the post
 - Download `nightly-benchmarks.zip`.
 - In the same folder, run the following code:
 
-```console
+```bash
 export HF_TOKEN=<your HF token>
 apt update
 apt install -y git
````

.buildkite/release-pipeline.yaml

Lines changed: 2 additions & 0 deletions

```diff
@@ -102,6 +102,7 @@ steps:
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
@@ -117,6 +118,7 @@ steps:
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
```
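Both hunks have the same effect: each release build now publishes a moving `latest` tag alongside the pinned release tag. A minimal consumer-side sketch, assuming you want the newest image rather than a pinned version (the repository URL comes from the pipeline above):

```bash
# Pull the most recently released CPU image; pin the release-version tag
# instead if you need reproducible deployments.
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
```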

.buildkite/scripts/tpu/config_v6e_1.env

Lines changed: 2 additions & 2 deletions

```diff
@@ -4,8 +4,8 @@ CONTAINER_NAME=vllm-tpu
 
 # vllm config
 MODEL=meta-llama/Llama-3.1-8B-Instruct
-MAX_NUM_SEQS=512
-MAX_NUM_BATCHED_TOKENS=512
+MAX_NUM_SEQS=256
+MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
```
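A minimal sketch of how these values might be consumed; the `vllm serve` flags are standard engine arguments, but the actual launcher that reads this env file is not shown in the commit, so treat the wiring as an assumption:

```bash
# Hypothetical launcher: source the TPU config and forward it to vllm serve.
source .buildkite/scripts/tpu/config_v6e_1.env

vllm serve "$MODEL" \
    --max-num-seqs "$MAX_NUM_SEQS" \
    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
    --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
    --max-model-len "$MAX_MODEL_LEN" \
    --download-dir "$DOWNLOAD_DIR"
```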

.github/mergify.yml

Lines changed: 14 additions & 1 deletion

```diff
@@ -45,6 +45,7 @@ pull_request_rules:
       - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
       - files~=^vllm/model_executor/models/.*llama.*\.py
       - files~=^vllm/transformers_utils/configs/.*llama.*\.py
+      - title~=(?i)llama
   actions:
     label:
       add:
@@ -65,6 +66,19 @@ pull_request_rules:
       add:
         - multi-modality
 
+- name: label-performance
+  description: Automatically apply performance label
+  conditions:
+    - or:
+      - files~=^benchmarks/
+      - files~=^vllm/benchmarks/
+      - files~=^tests/benchmarks/
+      - files~=^\.buildkite/nightly-benchmarks/
+  actions:
+    label:
+      add:
+        - performance
+
 - name: label-qwen
   description: Automatically apply qwen label
   conditions:
@@ -74,7 +88,6 @@ pull_request_rules:
       - files~=^vllm/model_executor/models/.*qwen.*\.py
       - files~=^vllm/reasoning/.*qwen.*\.py
       - title~=(?i)Qwen
-      - body~=(?i)Qwen
   actions:
     label:
       add:
```
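The new `label-performance` rule labels any PR touching benchmark code. As a quick sanity check, the same regexes can be exercised with `grep -E`; the sample paths below are made up:

```bash
# Paths matching any of the four file patterns would receive the "performance" label.
printf '%s\n' \
    benchmarks/benchmark_serving.py \
    vllm/benchmarks/datasets.py \
    vllm/model_executor/models/llama.py |
    grep -E '^(benchmarks/|vllm/benchmarks/|tests/benchmarks/|\.buildkite/nightly-benchmarks/)'
# Prints the first two paths; the llama model file does not match.
```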

.pre-commit-config.yaml

Lines changed: 5 additions & 0 deletions

```diff
@@ -115,6 +115,11 @@ repos:
     entry: python tools/check_spdx_header.py
     language: python
     types: [python]
+  - id: check-root-lazy-imports
+    name: Check root lazy imports
+    entry: python tools/check_init_lazy_imports.py
+    language: python
+    types: [python]
   - id: check-filenames
     name: Check for spaces in all filenames
     entry: bash
```
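The new hook can be run on its own with the standard pre-commit CLI (hook id taken from the diff above):

```bash
# Run only the new lazy-import check against every file in the repository.
pre-commit run check-root-lazy-imports --all-files
```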

README.md

Lines changed: 2 additions & 0 deletions

```diff
@@ -154,11 +154,13 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
 
 ## Contact Us
 
+<!-- --8<-- [start:contact-us] -->
 - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
 - For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
 - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
+<!-- --8<-- [end:contact-us] -->
 
 ## Media Kit
 
```
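The `--8<--` comments match the section-marker syntax of the pymdownx.snippets extension. Assuming the vLLM docs build uses that extension (not shown in this commit), a docs page could embed just this section with an include line like:

```markdown
--8<-- "README.md:contact-us"
```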
benchmarks/README.md

Lines changed: 190 additions & 0 deletions

````diff
@@ -269,6 +269,21 @@ python3 vllm/benchmarks/benchmark_serving.py \
     --num-prompts 10
 ```
 
+### Running With Ramp-Up Request Rate
+
+The benchmark tool also supports ramping up the request rate over the
+duration of the benchmark run. This can be useful for stress testing the
+server or finding the maximum throughput that it can handle, given some latency budget.
+
+Two ramp-up strategies are supported:
+- `linear`: Increases the request rate linearly from a start value to an end value.
+- `exponential`: Increases the request rate exponentially.
+
+The following arguments can be used to control the ramp-up:
+- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
+- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
+- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
+
 ---
 ## Example - Offline Throughput Benchmark
 
````
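To make the new flags concrete, here is an illustrative invocation that combines them with the serving benchmark this README documents; the model and dataset choices are placeholders, not part of the diff:

```bash
# Ramp the request rate linearly from 1 to 20 req/s over the run to find the
# highest rate the server sustains within a latency budget.
python3 vllm/benchmarks/benchmark_serving.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset-name sharegpt \
    --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
    --ramp-up-strategy linear \
    --ramp-up-start-rps 1 \
    --ramp-up-end-rps 20 \
    --num-prompts 1000
```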
````diff
@@ -387,3 +402,178 @@ python3 vllm/benchmarks/benchmark_throughput.py \
     --enable-lora \
     --lora-path yard1/llama-2-7b-sql-lora-test
 ```
+
+---
+## Example - Structured Output Benchmark
+
+Benchmark the performance of structured output generation (JSON, grammar, regex).
+
+### Server Setup
+
+```bash
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+```
+
+### JSON Schema Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --dataset json \
+    --structured-output-ratio 1.0 \
+    --request-rate 10 \
+    --num-prompts 1000
+```
+
+### Grammar-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --dataset grammar \
+    --structure-type grammar \
+    --request-rate 10 \
+    --num-prompts 1000
+```
+
+### Regex-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --dataset regex \
+    --request-rate 10 \
+    --num-prompts 1000
+```
+
+### Choice-based Generation Benchmark
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --dataset choice \
+    --request-rate 10 \
+    --num-prompts 1000
+```
+
+### XGrammar Benchmark Dataset
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --dataset xgrammar_bench \
+    --request-rate 10 \
+    --num-prompts 1000
+```
+
+---
+## Example - Long Document QA Throughput Benchmark
+
+Benchmark the performance of long document question-answering with prefix caching.
+
+### Basic Long Document QA Test
+
+```bash
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --enable-prefix-caching \
+    --num-documents 16 \
+    --document-length 2000 \
+    --output-len 50 \
+    --repeat-count 5
+```
+
+### Different Repeat Modes
+
+```bash
+# Random mode (default) - shuffle prompts randomly
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --enable-prefix-caching \
+    --num-documents 8 \
+    --document-length 3000 \
+    --repeat-count 3 \
+    --repeat-mode random
+
+# Tile mode - repeat entire prompt list in sequence
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --enable-prefix-caching \
+    --num-documents 8 \
+    --document-length 3000 \
+    --repeat-count 3 \
+    --repeat-mode tile
+
+# Interleave mode - repeat each prompt consecutively
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --enable-prefix-caching \
+    --num-documents 8 \
+    --document-length 3000 \
+    --repeat-count 3 \
+    --repeat-mode interleave
+```
+
+---
+## Example - Prefix Caching Benchmark
+
+Benchmark the efficiency of automatic prefix caching.
+
+### Fixed Prompt with Prefix Caching
+
+```bash
+python3 benchmarks/benchmark_prefix_caching.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --enable-prefix-caching \
+    --num-prompts 1 \
+    --repeat-count 100 \
+    --input-length-range 128:256
+```
+
+### ShareGPT Dataset with Prefix Caching
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+python3 benchmarks/benchmark_prefix_caching.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --enable-prefix-caching \
+    --num-prompts 20 \
+    --repeat-count 5 \
+    --input-length-range 128:256
+```
+
+---
+## Example - Request Prioritization Benchmark
+
+Benchmark the performance of request prioritization in vLLM.
+
+### Basic Prioritization Test
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --input-len 128 \
+    --output-len 64 \
+    --num-prompts 100 \
+    --scheduling-policy priority
+```
+
+### Multiple Sequences per Prompt
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --input-len 128 \
+    --output-len 64 \
+    --num-prompts 100 \
+    --scheduling-policy priority \
+    --n 2
+```
````
