
Commit 626c432

Update readme file

change command line argument name to 'kv-cache-transfer-latency'

Signed-off-by: Maya Barnea <mayab@il.ibm.com>

1 parent 276f15a · commit 626c432

File tree

6 files changed: +8 additions, -7 deletions


README.md

Lines changed: 3 additions & 2 deletions
@@ -29,9 +29,9 @@ The simulator supports two modes of operation:
 - `echo` mode: the response contains the same text that was received in the request. For `/v1/chat/completions` the last message for the role=`user` is used.
 - `random` mode: the response is randomly chosen from a set of pre-defined sentences.
 
-Timing of the response is defined by two parameters: `time-to-first-token` and `inter-token-latency`.
+Timing of the response is defined by `time-to-first-token` and `inter-token-latency` parameters. In case P/D is enabled for a request, `kv-cache-transfer-latency` will be used instead of `time-to-first-token`.
 
-For a request with `stream=true`: `time-to-first-token` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream.
+For a request with `stream=true`: `time-to-first-token` or `kv-cache-transfer-latency` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream.
 
 For a requst with `stream=false`: the response is returned after delay of `<time-to-first-token> + (<inter-token-latency> * (<number_of_output_tokens> - 1))`
 
@@ -99,6 +99,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `random`: returns a sentence chosen at random from a set of pre-defined sentences
 - `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
 - `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
+- `kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), by default zero. Usually much shorter than `time-to-first-token`
 - `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
 
 In addition, as we are using klog, the following parameters are available:
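The updated README text pins down the timing model: the total delay comes from `time-to-first-token` (or `kv-cache-transfer-latency` when P/D is enabled for the request) plus one `inter-token-latency` per additional output token. A minimal Go sketch of that formula follows; the helper name and signature are illustrative only, not the simulator's actual code:

```go
package main

import (
	"fmt"
	"time"
)

// responseDelay illustrates the README formula for stream=false:
// <time-to-first-token> + <inter-token-latency> * (<number_of_output_tokens> - 1).
// When P/D is enabled, kv-cache-transfer-latency replaces time-to-first-token.
func responseDelay(ttft, interToken, kvTransfer int, pdEnabled bool, outputTokens int) time.Duration {
	first := ttft
	if pdEnabled {
		first = kvTransfer
	}
	totalMs := first + interToken*(outputTokens-1)
	return time.Duration(totalMs) * time.Millisecond
}

func main() {
	// Using the values from manifests/basic-config.yaml (2000 / 1000 / 100 ms) and 5 output tokens:
	fmt.Println(responseDelay(2000, 1000, 100, false, 5)) // 6s    (2000 + 1000*4)
	fmt.Println(responseDelay(2000, 1000, 100, true, 5))  // 4.1s  (100 + 1000*4)
}
```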

manifests/basic-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -4,5 +4,5 @@ max-num-seqs: 5
 mode: "random"
 time-to-first-token: 2000
 inter-token-latency: 1000
-kv_cache_transfer_latency: 100
+kv-cache-transfer-latency: 100
 seed: 100100100

manifests/config.yaml

Lines changed: 1 addition & 1 deletion
@@ -12,5 +12,5 @@ lora-modules:
 mode: "random"
 time-to-first-token: 2000
 inter-token-latency: 1000
-kv_cache_transfer_latency: 100
+kv-cache-transfer-latency: 100
 seed: 100100100

pkg/llm-d-inference-sim/config.go

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ type configuration struct {
 	// InterTokenLatency time between generated tokens, in milliseconds
 	InterTokenLatency int `yaml:"inter-token-latency"`
 	// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated, in milliseconds
-	KVCacheTransferLatency int `yaml:"kv_cache_transfer_latency"`
+	KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency"`
 
 	// Mode defines the simulator response generation mode, valid values: echo, random
 	Mode string `yaml:"mode"`
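The struct tag is what ties the manifests to this field: whatever YAML decoder the simulator uses matches keys against these tags, so after this change the old `kv_cache_transfer_latency` key would simply be ignored and the field would stay at zero. A minimal sketch of that behaviour, assuming a `gopkg.in/yaml.v3`-style unmarshal (the simulator's actual config-loading code is not part of this diff):

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// Trimmed-down copy of the configuration struct, keeping only the fields relevant here.
type configuration struct {
	TimeToFirstToken       int `yaml:"time-to-first-token"`
	InterTokenLatency      int `yaml:"inter-token-latency"`
	KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency"`
}

func main() {
	// The same keys as in manifests/basic-config.yaml after this commit.
	data := []byte("time-to-first-token: 2000\ninter-token-latency: 1000\nkv-cache-transfer-latency: 100\n")

	var cfg configuration
	if err := yaml.Unmarshal(data, &cfg); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", cfg) // {TimeToFirstToken:2000 InterTokenLatency:1000 KVCacheTransferLatency:100}
}
```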

pkg/llm-d-inference-sim/config_test.go

Lines changed: 1 addition & 1 deletion
@@ -194,7 +194,7 @@ var _ = Describe("Simulator configuration", func() {
 		c.KVCacheTransferLatency = 50
 		test = testCase{
 			name: "config file with command line args with time to transfer kv-cache",
-			args: []string{"cmd", "--config", "../../manifests/basic-config.yaml", "--kv_cache_transfer_latency", "50"},
+			args: []string{"cmd", "--config", "../../manifests/basic-config.yaml", "--kv-cache-transfer-latency", "50"},
 			expectedConfig: c,
 		}
 		tests = append(tests, test)

pkg/llm-d-inference-sim/simulator.go

Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,7 @@ func (s *VllmSimulator) parseCommandParamsAndLoadConfig() error {
 	f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
 	f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
 	f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
-	f.IntVar(&config.KVCacheTransferLatency, "kv_cache_transfer_latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
+	f.IntVar(&config.KVCacheTransferLatency, "kv-cache-transfer-latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
 	f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
 
 	// These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help
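Registering the flag under its hyphenated name keeps the CLI consistent with the YAML key and with the updated test above. A standalone sketch of the same parsing pattern using Go's standard `flag` package (the simulator may register its flags through a different flag library; this only illustrates the renamed flag):

```go
package main

import (
	"flag"
	"fmt"
)

func main() {
	var kvCacheTransferLatency int

	// Register the flag under its new hyphenated name, mirroring simulator.go.
	f := flag.NewFlagSet("cmd", flag.ContinueOnError)
	f.IntVar(&kvCacheTransferLatency, "kv-cache-transfer-latency", 0,
		"Time for KV-cache transfer from a remote vLLM (in milliseconds)")

	// Same argument style as the updated case in config_test.go.
	if err := f.Parse([]string{"--kv-cache-transfer-latency", "50"}); err != nil {
		panic(err)
	}
	fmt.Println(kvCacheTransferLatency) // 50
}
```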
