Commit ba4e905

Merge branch 'main' into fused_moe_lora
2 parents (7f23672 + 32142b3) · commit ba4e905

File tree: 346 files changed (+11532, -19622 lines)

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 0 additions & 1 deletion
@@ -108,7 +108,6 @@ fi
 if [[ $commands == *" kernels/attention"* ]]; then
   commands="${commands} \
   --ignore=kernels/attention/test_attention_selector.py \
-  --ignore=kernels/attention/test_blocksparse_attention.py \
   --ignore=kernels/attention/test_encoder_decoder_attn.py \
   --ignore=kernels/attention/test_flash_attn.py \
   --ignore=kernels/attention/test_flashinfer.py \

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 9 additions & 9 deletions
@@ -6,6 +6,7 @@ set -ex
 
 # allow to bind to different cores
 CORE_RANGE=${CORE_RANGE:-48-95}
+# used for TP/PP E2E test
 OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
@@ -24,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 
 function cpu_tests() {
   set -e
@@ -78,17 +79,16 @@ function cpu_tests() {
   # tests/quantization/test_ipex_quant.py"
 
   # online serving
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$NUMA_NODE" bash -c '
     set -e
-    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
-    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-    VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
+    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+    python3 benchmarks/benchmark_serving.py \
       --backend vllm \
       --dataset-name random \
-      --model facebook/opt-125m \
+      --model meta-llama/Llama-3.2-3B-Instruct \
       --num-prompts 20 \
-      --endpoint /v1/completions \
-      --tokenizer facebook/opt-125m"
+      --endpoint /v1/completions'
 
   # Run multi-lora tests
   docker exec cpu-test-"$NUMA_NODE" bash -c "

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 15 deletions
@@ -159,7 +159,6 @@ steps:
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
   - tests/distributed/test_events
-  - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
@@ -182,7 +181,6 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s distributed/test_events.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
   - pushd ../examples/offline_inference
@@ -266,6 +264,7 @@ steps:
   - pytest -v -s v1/structured_output
   - pytest -v -s v1/spec_decode
   - pytest -v -s v1/kv_connector/unit
+  - pytest -v -s v1/metrics
   - pytest -v -s v1/test_serial_utils.py
   - pytest -v -s v1/test_utils.py
   - pytest -v -s v1/test_oracle.py
@@ -274,7 +273,7 @@ steps:
   # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
   - pytest -v -s v1/e2e
   # Integration test for streaming correctness (requires special branch).
-  - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
+  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
 - label: Examples Test # 25min
@@ -330,17 +329,6 @@ steps:
   - pytest -v -s samplers
   - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 
-- label: Speculative decoding tests # 40min
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/spec_decode
-  - tests/spec_decode
-  - vllm/model_executor/models/eagle.py
-  commands:
-  - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
-  - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
-
 - label: LoRA Test %N # 15min each
   mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
@@ -726,7 +714,6 @@ steps:
   - pytest -v -s distributed/test_sequence_parallel.py
   # this test fails consistently.
   # TODO: investigate and fix
-  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown

.github/CODEOWNERS

Lines changed: 1 addition & 2 deletions
@@ -16,7 +16,7 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm
 /vllm/entrypoints @aarnphm
-/vllm/compilation @zou3519 @youkaichao
+/vllm/compilation @zou3519 @youkaichao @ProExpertProg
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # Any change to the VllmConfig changes can have a large user-facing impact,
@@ -43,7 +43,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat
-/tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm

.github/ISSUE_TEMPLATE/750-RFC.yml

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ body:
 - type: markdown
   attributes:
     value: >
-      Thanks for contributing 🎉!
+      Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
 - type: checkboxes
   id: askllm
   attributes:

.github/mergify.yml

Lines changed: 0 additions & 3 deletions
@@ -164,10 +164,7 @@ pull_request_rules:
   description: Automatically apply speculative-decoding label
   conditions:
     - or:
-      - files~=^vllm/spec_decode/
       - files~=^vllm/v1/spec_decode/
-      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
-      - files~=^tests/spec_decode/
       - files~=^tests/v1/spec_decode/
       - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
       - files~=^vllm/model_executor/models/.*eagle.*\.py

RELEASE.md

Lines changed: 33 additions & 0 deletions
@@ -52,3 +52,36 @@ After branch cut, we approach finalizing the release branch with clear criteria
 * Release branch specific changes (e.g. change version identifiers or CI fixes)
 
 Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes.
+
+## Manual validations
+
+### E2E Performance Validation
+
+Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI.
+
+**Current Coverage:**
+* Models: Llama3, Llama4, and Mixtral
+* Hardware: NVIDIA H100 and AMD MI300x
+* *Note: Coverage may change based on new model releases and hardware availability*
+
+**Performance Validation Process:**
+
+**Step 1: Get Access**
+Request write access to the [pytorch/pytorch-integration-testing](https://github.com/pytorch/pytorch-integration-testing) repository to run the benchmark workflow.
+
+**Step 2: Review Benchmark Setup**
+Familiarize yourself with the benchmark configurations:
+* [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda)
+* [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm)
+
+**Step 3: Run the Benchmark**
+Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure:
+* **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`)
+* **vLLM commit**: Set to the RC commit hash
+
+**Step 4: Review Results**
+Once the workflow completes, benchmark results will be available on the [vLLM benchmark dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) under the corresponding branch and commit.
+
+**Step 5: Performance Comparison**
+Compare the current results against the previous release to verify no performance regressions have occurred. Here is an
+example of [v0.9.1 vs v0.9.2](https://hud.pytorch.org/benchmark/llms?startTime=Thu%2C%2017%20Apr%202025%2021%3A43%3A50%20GMT&stopTime=Wed%2C%2016%20Jul%202025%2021%3A43%3A50%20GMT&granularity=week&lBranch=releases/v0.9.1&lCommit=b6553be1bc75f046b00046a4ad7576364d03c835&rBranch=releases/v0.9.2&rCommit=a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f&repoName=vllm-project%2Fvllm&benchmarkName=&modelName=All%20Models&backendName=All%20Backends&modeName=All%20Modes&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms).
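
As a rough illustration of Step 3 in the new RELEASE.md text, the same dispatch can also be triggered from the GitHub CLI instead of the Actions web UI. This is only a sketch under assumptions: it presumes `gh` is authenticated with write access to the repository, and the input names `vllm_branch` and `vllm_commit` are placeholders that should be checked against the workflow definition before use.

```bash
# Hedged sketch only: dispatch the vllm-benchmark workflow from the CLI.
# The -f input names below are assumed, not confirmed -- verify them in the
# workflow file under pytorch/pytorch-integration-testing before running.
gh workflow run vllm-benchmark.yml \
  --repo pytorch/pytorch-integration-testing \
  -f vllm_branch=releases/v0.9.2 \
  -f vllm_commit=<RC_COMMIT_SHA>

# Check run status; results then appear on the HUD dashboard
# (https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm).
gh run list --repo pytorch/pytorch-integration-testing --workflow=vllm-benchmark.yml
```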

benchmarks/auto_tune/README.md

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
+# Automated vLLM Server Parameter Tuning
+
+This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.
+
+## Table of Contents
+- [Prerequisites](#prerequisites)
+- [Configuration](#configuration)
+- [How to Run](#how-to-run)
+- [Example Use Cases](#example-use-cases)
+- [Output](#output)
+- [How It Works](#how-it-works)
+
+## Prerequisites
+
+Before running the script, please ensure the following steps are completed:
+
+1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out to your desired branch.
+
+    ```bash
+    git clone https://github.com/vllm-project/vllm.git
+    cd vllm
+    # git checkout <your-branch>
+    ```
+
+1. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions.
+
+2. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible.
+
+## Configuration
+
+You must set the following variables at the top of the script before execution.
+
+| Variable | Description | Example Value |
+| --- | --- | --- |
+| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
+| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
+| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` |
+| `TP` | **Required.** The tensor-parallelism size. | `1` |
+| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
+| `INPUT_LEN` | **Required.** Request input length. | `4000` |
+| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
+| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
+| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
+| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
+| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` |
+
+**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`.
+
+## How to Run
+
+1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
+2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.
+
+    ```
+    cd <FOLDER_OF_THIS_SCRIPT>
+    bash auto_tune.sh
+    ```
+
+Please note that the `bash auto_tune.sh` command cannot contain full or partial path with keyword `vllm`, otherwise `pkill -f vllm` command will also kill this script itself.
+
+## Example Use Cases
+
+Here are a few examples of how to configure the script for different goals:
+
+### 1. Maximize Throughput (No Latency Constraint)
+- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
+- **Configuration**:
+
+    ```bash
+    INPUT_LEN=1800
+    OUTPUT_LEN=20
+    MIN_CACHE_HIT_PCT=0
+    MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
+    ```
+
+#### 2. Maximize Throughput with a Latency Requirement
+- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
+- **Configuration**:
+
+    ```bash
+    INPUT_LEN=1800
+    OUTPUT_LEN=20
+    MIN_CACHE_HIT_PCT=0
+    MAX_LATENCY_ALLOWED_MS=500
+    ```
+
+#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
+- **Configuration**:
+
+    ```bash
+    INPUT_LEN=1800
+    OUTPUT_LEN=20
+    MIN_CACHE_HIT_PCT=60
+    MAX_LATENCY_ALLOWED_MS=500
+    ```
+
+## Output
+
+After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`.
+
+- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
+  - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
+  - `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run.
+
+- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
+
+    ```
+    # Example result.txt content
+    hash:a1b2c3d4...
+    max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8
+    max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500
+    ...
+    best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/vllm/auto-benchmark/2024_08_01_10_30/profile
+    ```
+
+  If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict.
+
+- **Profiler Trace**: A directory named `profile` is created inside the log directory. It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run.
+
+## How It Works
+
+The script follows a systematic process to find the optimal parameters:
+
+1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing.
+
+2. **Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists.
+
+3. **Latency-Aware Throughput Search**: For each parameter combination:
+    - The vLLM server is started.
+    - A benchmark is first run with an infinite request rate (`--request-rate inf`).
+    - If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration.
+    - If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement.
+
+4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
+
+5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
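
The latency-aware search described in step 3 of the new README can be pictured as the loop sketched below. This is only an illustration under assumed names and step sizes (`run_benchmark` is a stub, and the starting rate and decrement are arbitrary); the real `auto_tune.sh` drives `benchmarks/benchmark_serving.py` and parses its output instead.

```bash
#!/bin/bash
# Rough sketch of the latency-aware request-rate back-off -- NOT the real
# auto_tune.sh logic. run_benchmark stands in for a benchmark_serving.py run
# that reports P99 E2E latency (ms) and throughput (req/s).
MAX_LATENCY_ALLOWED_MS=500

run_benchmark() {  # stub: prints "<p99_e2e_latency_ms> <throughput_req_s>"
  local rate="$1"
  if [[ "$rate" == "inf" ]]; then echo "900 12.5"; else echo "$((rate * 40)) $rate"; fi
}

request_rate="inf"   # first attempt: unbounded request rate
while true; do
  read -r p99 tput <<< "$(run_benchmark "$request_rate")"
  if (( $(echo "$p99 <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
    echo "throughput for this config: $tput req/s (request rate: $request_rate)"
    break
  fi
  # Latency too high: fall back to a finite rate and keep decreasing it.
  if [[ "$request_rate" == "inf" ]]; then
    request_rate=20
  else
    request_rate=$((request_rate - 2))
  fi
  (( request_rate <= 0 )) && { echo "latency requirement too strict"; break; }
done
```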

benchmarks/auto_tune.sh renamed to benchmarks/auto_tune/auto_tune.sh

Lines changed: 1 addition & 30 deletions
@@ -1,36 +1,7 @@
 #!/bin/bash
 
 # This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
-# The current server parameter combination is max_num_seqs and max_num_batched_tokens
-# It also supports additional requirement: e2e latency and prefix cache.
-
-# Pre-requisite:
-# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version.
-# 2. If the model is customized, replace the MODEL's config with the customized config.
-# 3. Set variables (ALL REQUIRED)
-# BASE: your directory for vllm repo
-# MODEL: the model served by vllm
-# SYSTEM: the hardware, choice TPU or GPU, for other systems, "get best profile" might not support.
-# TP: ways of tensor parallelism
-# DOWNLOAD_DIR: directory to download and load model weights.
-# INPUT_LEN: request input len
-# OUTPUT_LEN: request output len
-# MIN_CACHE_HIT_PCT: prefix cache rate
-# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
-# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
-# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
-# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
-# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
-# 5. The final result will be saved in RESULT file.
-
-
-# Example use cases
-# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput?
-# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
-# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter?
-# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
-# 3. If we want to reach 60% prefix cache, what's the best server parameter?
-# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
+# See details in README (benchmarks/auto_tune/README.md).
 
 TAG=$(date +"%Y_%m_%d_%H_%M")
 BASE=""
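
With the inline documentation moved to the README, a filled-in version of the variable block at the top of the renamed script might look like the sketch below. The values simply mirror the example column of the README's configuration table; they are illustrative, not tuned recommendations.

```bash
# Illustrative values only, mirroring the README's example column.
BASE="$HOME"
MODEL="meta-llama/Llama-3.1-8B-Instruct"
SYSTEM="TPU"                          # or "GPU"
TP=1
DOWNLOAD_DIR=""                       # empty string -> default download path
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=60                  # 0 disables the prefix-cache constraint
MAX_LATENCY_ALLOWED_MS=500            # use a huge number to ignore latency
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="1024 2048 4096"
```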
