
Commit 626c432

Update readme file

change command line argument name to 'kv-cache-transfer-latency'

Signed-off-by: Maya Barnea <mayab@il.ibm.com>

1 parent 276f15a · commit 626c432

File tree

6 files changed: +8 additions, -7 deletions


README.md

Lines changed: 3 additions & 2 deletions
@@ -29,9 +29,9 @@ The simulator supports two modes of operation:
 - `echo` mode: the response contains the same text that was received in the request. For `/v1/chat/completions` the last message for the role=`user` is used.
 - `random` mode: the response is randomly chosen from a set of pre-defined sentences.
 
-Timing of the response is defined by two parameters: `time-to-first-token` and `inter-token-latency`.
+Timing of the response is defined by `time-to-first-token` and `inter-token-latency` parameters. In case P/D is enabled for a request, `kv-cache-transfer-latency` will be used instead of `time-to-first-token`.
 
-For a request with `stream=true`: `time-to-first-token` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream.
+For a request with `stream=true`: `time-to-first-token` or `kv-cache-transfer-latency` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream.
 
 For a requst with `stream=false`: the response is returned after delay of `<time-to-first-token> + (<inter-token-latency> * (<number_of_output_tokens> - 1))`
 
@@ -99,6 +99,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `random`: returns a sentence chosen at random from a set of pre-defined sentences
 - `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
 - `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
+- `kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), by default zero. Usually much shorter than `time-to-first-token`
 - `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
 
 In addition, as we are using klog, the following parameters are available:
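The updated README text pins down the timing model: the total delay comes from `time-to-first-token` (or `kv-cache-transfer-latency` when P/D is enabled for the request) plus one `inter-token-latency` per additional output token. A minimal Go sketch of that formula follows; the helper name and signature are illustrative only, not the simulator's actual code:

```go
package main

import (
	"fmt"
	"time"
)

// responseDelay illustrates the README formula for stream=false:
// <time-to-first-token> + <inter-token-latency> * (<number_of_output_tokens> - 1).
// When P/D is enabled, kv-cache-transfer-latency replaces time-to-first-token.
func responseDelay(ttft, interToken, kvTransfer int, pdEnabled bool, outputTokens int) time.Duration {
	first := ttft
	if pdEnabled {
		first = kvTransfer
	}
	totalMs := first + interToken*(outputTokens-1)
	return time.Duration(totalMs) * time.Millisecond
}

func main() {
	// Using the values from manifests/basic-config.yaml (2000 / 1000 / 100 ms) and 5 output tokens:
	fmt.Println(responseDelay(2000, 1000, 100, false, 5)) // 6s    (2000 + 1000*4)
	fmt.Println(responseDelay(2000, 1000, 100, true, 5))  // 4.1s  (100 + 1000*4)
}
```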

manifests/basic-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -4,5 +4,5 @@ max-num-seqs: 5
 mode: "random"
 time-to-first-token: 2000
 inter-token-latency: 1000
-kv_cache_transfer_latency: 100
+kv-cache-transfer-latency: 100
 seed: 100100100

manifests/config.yaml

Lines changed: 1 addition & 1 deletion
@@ -12,5 +12,5 @@ lora-modules:
 mode: "random"
 time-to-first-token: 2000
 inter-token-latency: 1000
-kv_cache_transfer_latency: 100
+kv-cache-transfer-latency: 100
 seed: 100100100

pkg/llm-d-inference-sim/config.go

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ type configuration struct {
 	// InterTokenLatency time between generated tokens, in milliseconds
 	InterTokenLatency int `yaml:"inter-token-latency"`
 	// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated, in milliseconds
-	KVCacheTransferLatency int `yaml:"kv_cache_transfer_latency"`
+	KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency"`
 
 	// Mode defines the simulator response generation mode, valid values: echo, random
 	Mode string `yaml:"mode"`
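The struct tag is what ties the manifests to this field: whatever YAML decoder the simulator uses matches keys against these tags, so after this change the old `kv_cache_transfer_latency` key would simply be ignored and the field would stay at zero. A minimal sketch of that behaviour, assuming a `gopkg.in/yaml.v3`-style unmarshal (the simulator's actual config-loading code is not part of this diff):

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// Trimmed-down copy of the configuration struct, keeping only the fields relevant here.
type configuration struct {
	TimeToFirstToken       int `yaml:"time-to-first-token"`
	InterTokenLatency      int `yaml:"inter-token-latency"`
	KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency"`
}

func main() {
	// The same keys as in manifests/basic-config.yaml after this commit.
	data := []byte("time-to-first-token: 2000\ninter-token-latency: 1000\nkv-cache-transfer-latency: 100\n")

	var cfg configuration
	if err := yaml.Unmarshal(data, &cfg); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", cfg) // {TimeToFirstToken:2000 InterTokenLatency:1000 KVCacheTransferLatency:100}
}
```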

pkg/llm-d-inference-sim/config_test.go

Lines changed: 1 addition & 1 deletion
@@ -194,7 +194,7 @@ var _ = Describe("Simulator configuration", func() {
 		c.KVCacheTransferLatency = 50
 		test = testCase{
 			name: "config file with command line args with time to transfer kv-cache",
-			args: []string{"cmd", "--config", "../../manifests/basic-config.yaml", "--kv_cache_transfer_latency", "50"},
+			args: []string{"cmd", "--config", "../../manifests/basic-config.yaml", "--kv-cache-transfer-latency", "50"},
 			expectedConfig: c,
 		}
 		tests = append(tests, test)

pkg/llm-d-inference-sim/simulator.go

Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,7 @@ func (s *VllmSimulator) parseCommandParamsAndLoadConfig() error {
 	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
-	f.IntVar(&config.KVCacheTransferLatency, "kv_cache_transfer_latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
+	f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
 
 	// These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help
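Registering the flag under its hyphenated name keeps the CLI consistent with the YAML key and with the updated test above. A standalone sketch of the same parsing pattern using Go's standard `flag` package (the simulator may register its flags through a different flag library; this only illustrates the renamed flag):

```go
package main

import (
	"flag"
	"fmt"
)

func main() {
	var kvCacheTransferLatency int

	// Register the flag under its new hyphenated name, mirroring simulator.go.
	f := flag.NewFlagSet("cmd", flag.ContinueOnError)
	f.IntVar(&kvCacheTransferLatency, "kv-cache-transfer-latency", 0,
		"Time for KV-cache transfer from a remote vLLM (in milliseconds)")

	// Same argument style as the updated case in config_test.go.
	if err := f.Parse([]string{"--kv-cache-transfer-latency", "50"}); err != nil {
		panic(err)
	}
	fmt.Println(kvCacheTransferLatency) // 50
}
```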
