
Commit 996dae3
P/D support (#94)
* Add P/D support: respond according to the doRemotePrefill and doRemoteDecode fields
* Add a test for the kv-cache transfer time command line parameter; update config_test to use a function that creates the same configuration as defined in the config yaml file
* Update the README file; change the command line argument name to 'kv-cache-transfer-latency'
* Fixes according to the PR comments
* Add comments for fields
* Fix utils_test: initialize random before use
* Fixes in the README according to the PR review

Signed-off-by: Maya Barnea <mayab@il.ibm.com>
1 parent a0109a3 commit 996dae3

File tree: 10 files changed, +157 -72 lines changed

README.md (4 additions, 3 deletions)

@@ -29,11 +29,11 @@ The simulator supports two modes of operation:
 - `echo` mode: the response contains the same text that was received in the request. For `/v1/chat/completions` the last message for the role=`user` is used.
 - `random` mode: the response is randomly chosen from a set of pre-defined sentences.
 
-Timing of the response is defined by two parameters: `time-to-first-token` and `inter-token-latency`.
+Timing of the response is defined by the `time-to-first-token` and `inter-token-latency` parameters. In case P/D is enabled for a request, `kv-cache-transfer-latency` will be used instead of `time-to-first-token`.
 
-For a request with `stream=true`: `time-to-first-token` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream.
+For a request with `stream=true`: `time-to-first-token` or `kv-cache-transfer-latency` defines the delay before the first token is returned, and `inter-token-latency` defines the delay between subsequent tokens in the stream.
 
-For a request with `stream=false`: the response is returned after a delay of `<time-to-first-token> + (<inter-token-latency> * (<number_of_output_tokens> - 1))`
+For a request with `stream=false`: the response is returned after a delay of `<time-to-first-token> + (<inter-token-latency> * (<number_of_output_tokens> - 1))`, or `<kv-cache-transfer-latency> + (<inter-token-latency> * (<number_of_output_tokens> - 1))` in the P/D case.
 
 It can be run standalone or in a Pod for testing under packages such as Kind.

@@ -99,6 +99,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `random`: returns a sentence chosen at random from a set of pre-defined sentences
 - `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
 - `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
+- `kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), optional, by default zero. Usually much shorter than `time-to-first-token`
 - `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
 
 In addition, as we are using klog, the following parameters are available:

manifests/basic-config.yaml (8 additions, 0 deletions)

@@ -0,0 +1,8 @@
+port: 8001
+model: "Qwen/Qwen2-0.5B"
+max-num-seqs: 5
+mode: "random"
+time-to-first-token: 2000
+inter-token-latency: 1000
+kv-cache-transfer-latency: 100
+seed: 100100100
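
The delay formula described in the README above can be sanity-checked against the values in this config file. Below is a minimal Go sketch, not the simulator's actual code: the `timing` struct and `responseDelay` helper are invented for illustration, and only the parameter names and the formula come from the README.

```go
package main

import (
	"fmt"
	"time"
)

// timing mirrors the three latency parameters from the README (milliseconds).
type timing struct {
	TimeToFirstToken       int
	InterTokenLatency      int
	KVCacheTransferLatency int
}

// responseDelay computes the total delay of a stream=false response.
// In the P/D case (prefill was done remotely), the KV-cache transfer
// latency replaces the time to first token.
func responseDelay(t timing, outputTokens int, remotePrefill bool) time.Duration {
	first := t.TimeToFirstToken
	if remotePrefill {
		first = t.KVCacheTransferLatency
	}
	return time.Duration(first+t.InterTokenLatency*(outputTokens-1)) * time.Millisecond
}

func main() {
	// Values from manifests/basic-config.yaml.
	t := timing{TimeToFirstToken: 2000, InterTokenLatency: 1000, KVCacheTransferLatency: 100}
	fmt.Println(responseDelay(t, 5, false)) // 6s   = 2000 + 1000*(5-1)
	fmt.Println(responseDelay(t, 5, true))  // 4.1s =  100 + 1000*(5-1)
}
```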

manifests/config.yaml (3 additions, 2 deletions)

@@ -10,6 +10,7 @@ lora-modules:
 - '{"name":"lora1","path":"/path/to/lora1"}'
 - '{"name":"lora2","path":"/path/to/lora2"}'
 mode: "random"
-time-to-first-token: 2
-inter-token-latency: 1
+time-to-first-token: 2000
+inter-token-latency: 1000
+kv-cache-transfer-latency: 100
 seed: 100100100

pkg/llm-d-inference-sim/config.go (6 additions, 0 deletions)

@@ -53,6 +53,9 @@ type configuration struct {
 	TimeToFirstToken int `yaml:"time-to-first-token"`
 	// InterTokenLatency time between generated tokens, in milliseconds
 	InterTokenLatency int `yaml:"inter-token-latency"`
+	// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated, in milliseconds
+	KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency"`
+
 	// Mode defines the simulator response generation mode, valid values: echo, random
 	Mode string `yaml:"mode"`
 	// Seed defines random seed for operations

@@ -145,6 +148,9 @@ func (c *configuration) validate() error {
 	if c.TimeToFirstToken < 0 {
 		return errors.New("time to first token cannot be negative")
 	}
+	if c.KVCacheTransferLatency < 0 {
+		return errors.New("kv-cache transfer time cannot be negative")
+	}
 	if c.MaxLoras < 1 {
 		return errors.New("max LoRAs cannot be less than 1")
 	}
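
For reference, the `yaml` tags above are what bind the new `kv-cache-transfer-latency` key to the struct. A self-contained sketch of that round trip, assuming the gopkg.in/yaml.v3 package (the repository may use a different YAML library) and a trimmed copy of the configuration type:

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// A trimmed copy of the configuration struct, keeping only the
// latency fields touched by this commit.
type configuration struct {
	TimeToFirstToken       int `yaml:"time-to-first-token"`
	InterTokenLatency      int `yaml:"inter-token-latency"`
	KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency"`
}

func main() {
	data := []byte(`
time-to-first-token: 2000
inter-token-latency: 1000
kv-cache-transfer-latency: 100
`)
	var c configuration
	if err := yaml.Unmarshal(data, &c); err != nil {
		panic(err)
	}
	// Same sanity check as configuration.validate() above.
	if c.KVCacheTransferLatency < 0 {
		panic("kv-cache transfer time cannot be negative")
	}
	fmt.Printf("%+v\n", c) // {TimeToFirstToken:2000 InterTokenLatency:1000 KVCacheTransferLatency:100}
}
```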

pkg/llm-d-inference-sim/config_test.go (40 additions, 54 deletions)

@@ -25,8 +25,7 @@ import (
 )
 
 const (
-	qwenModelName    = "Qwen/Qwen2-0.5B"
-	seedInConfigFile = 100100100
+	qwenModelName = "Qwen/Qwen2-0.5B"
 )
 
 func createSimConfig(args []string) (*configuration, error) {

@@ -46,6 +45,23 @@ func createSimConfig(args []string) (*configuration, error) {
 	return s.config, nil
 }
 
+func createDefaultConfig(model string) *configuration {
+	c := newConfig()
+
+	c.Model = model
+	c.ServedModelNames = []string{c.Model}
+	c.MaxNumSeqs = 5
+	c.MaxLoras = 2
+	c.MaxCPULoras = 5
+	c.TimeToFirstToken = 2000
+	c.InterTokenLatency = 1000
+	c.KVCacheTransferLatency = 100
+	c.Seed = 100100100
+	c.LoraModules = []loraModule{}
+
+	return c
+}
+
 type testCase struct {
 	name string
 	args []string

@@ -69,17 +85,10 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file
-	c = newConfig()
+	c = createDefaultConfig(qwenModelName)
 	c.Port = 8001
-	c.Model = qwenModelName
 	c.ServedModelNames = []string{"model1", "model2"}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
 	c.LoraModules = []loraModule{{Name: "lora1", Path: "/path/to/lora1"}, {Name: "lora2", Path: "/path/to/lora2"}}
-	c.Seed = seedInConfigFile
 	test = testCase{
 		name: "config file",
 		args: []string{"cmd", "--config", "../../manifests/config.yaml"},

@@ -92,15 +101,9 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args
-	c = newConfig()
+	c = createDefaultConfig(model)
 	c.Port = 8002
-	c.Model = model
 	c.ServedModelNames = []string{"alias1", "alias2"}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
 	c.Seed = 100
 	c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}, {Name: "lora4", Path: "/path/to/lora4"}}
 	c.LoraModulesString = []string{

@@ -118,16 +121,8 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args with different format
-	c = newConfig()
+	c = createDefaultConfig(model)
 	c.Port = 8002
-	c.Model = model
-	c.ServedModelNames = []string{c.Model}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
-	c.Seed = seedInConfigFile
 	c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}}
 	c.LoraModulesString = []string{
 		"{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}",

@@ -143,16 +138,8 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args with empty string
-	c = newConfig()
+	c = createDefaultConfig(model)
 	c.Port = 8002
-	c.Model = model
-	c.ServedModelNames = []string{c.Model}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
-	c.Seed = seedInConfigFile
 	c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}}
 	c.LoraModulesString = []string{
 		"{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}",

@@ -168,18 +155,10 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args with empty string for loras
-	c = newConfig()
+	c = createDefaultConfig(qwenModelName)
 	c.Port = 8001
-	c.Model = qwenModelName
 	c.ServedModelNames = []string{"model1", "model2"}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
-	c.LoraModules = []loraModule{}
 	c.LoraModulesString = []string{}
-	c.Seed = seedInConfigFile
 	test = testCase{
 		name: "config file with command line args with empty string for loras",
 		args: []string{"cmd", "--config", "../../manifests/config.yaml", "--lora-modules", ""},

@@ -188,25 +167,31 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args with empty parameter for loras
-	c = newConfig()
+	c = createDefaultConfig(qwenModelName)
 	c.Port = 8001
-	c.Model = qwenModelName
 	c.ServedModelNames = []string{"model1", "model2"}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
-	c.LoraModules = []loraModule{}
 	c.LoraModulesString = []string{}
-	c.Seed = seedInConfigFile
 	test = testCase{
 		name: "config file with command line args with empty parameter for loras",
 		args: []string{"cmd", "--config", "../../manifests/config.yaml", "--lora-modules"},
 		expectedConfig: c,
 	}
 	tests = append(tests, test)
 
+	// Config from basic-config.yaml file plus a command line arg for the kv-cache transfer time
+	c = createDefaultConfig(qwenModelName)
+	c.Port = 8001
+	// the basic config file does not contain lora-related properties
+	c.MaxLoras = 1
+	c.MaxCPULoras = 1
+	c.KVCacheTransferLatency = 50
+	test = testCase{
+		name: "config file with command line args with time to transfer kv-cache",
+		args: []string{"cmd", "--config", "../../manifests/basic-config.yaml", "--kv-cache-transfer-latency", "50"},
+		expectedConfig: c,
+	}
+	tests = append(tests, test)
+
 	// Invalid configurations
 	test = testCase{
 		name: "invalid model",

@@ -258,18 +243,19 @@ var _ = Describe("Simulator configuration", func() {
 	Entry(tests[4].name, tests[4].args, tests[4].expectedConfig),
 	Entry(tests[5].name, tests[5].args, tests[5].expectedConfig),
 	Entry(tests[6].name, tests[6].args, tests[6].expectedConfig),
+	Entry(tests[7].name, tests[7].args, tests[7].expectedConfig),
 )
 
 DescribeTable("invalid configurations",
 	func(args []string) {
 		_, err := createSimConfig(args)
 		Expect(err).To(HaveOccurred())
 	},
-	Entry(tests[7].name, tests[7].args),
 	Entry(tests[8].name, tests[8].args),
 	Entry(tests[9].name, tests[9].args),
 	Entry(tests[10].name, tests[10].args),
 	Entry(tests[11].name, tests[11].args),
 	Entry(tests[12].name, tests[12].args),
+	Entry(tests[13].name, tests[13].args),
 )
})

pkg/llm-d-inference-sim/request.go (24 additions, 0 deletions)

@@ -44,6 +44,10 @@ type completionRequest interface {
 	getToolChoice() string
 	// getMaxCompletionTokens returns the maximum completion tokens requested
 	getMaxCompletionTokens() *int64
+	// doRemoteDecode() returns true if the do_remote_decode field is true in the request, meaning this is a prefill request
+	doRemoteDecode() bool
+	// doRemotePrefill() returns true if the do_remote_prefill field is true in the request, meaning this is a decode request
+	doRemotePrefill() bool
 }
 
 // baseCompletionRequest contains base completion request related information

@@ -54,6 +58,18 @@ type baseCompletionRequest struct {
 	StreamOptions streamOptions `json:"stream_options"`
 	// Model defines Model name to use for "inference", could be base Model name or one of available LoRA adapters
 	Model string `json:"model"`
+	// DoRemoteDecode boolean value, true when the request's decode will be done on a remote pod
+	DoRemoteDecode bool `json:"do_remote_decode"`
+	// DoRemotePrefill boolean value, true when the request's prefill was done on a remote pod
+	DoRemotePrefill bool `json:"do_remote_prefill"`
+	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
+	RemoteBlockIds []string `json:"remote_block_ids"`
+	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
+	RemoteEngineId string `json:"remote_engine_id"`
+	// RemoteHost is a hostname or IP address of the remote server handling prefill
+	RemoteHost string `json:"remote_host"`
+	// RemotePort is a port of the remote server handling prefill
+	RemotePort int `json:"remote_port"`
 }
 
 // StreamOptions defines streaming options for streaming requests

@@ -74,6 +90,14 @@
 	return !b.Stream || b.StreamOptions.IncludeUsage
 }
 
+func (b *baseCompletionRequest) doRemoteDecode() bool {
+	return b.DoRemoteDecode
+}
+
+func (b *baseCompletionRequest) doRemotePrefill() bool {
+	return b.DoRemotePrefill
+}
+
 // completionReqCtx is a context passed in the simulator's flow, it contains the request data needed
 // to generate the simulator's response
 type completionReqCtx struct {
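
The `json` tags above define the wire format of the new fields. Below is a hypothetical round trip using a trimmed copy of the request struct rather than the simulator's own type, showing how a decode request (one whose prefill already ran elsewhere) would be parsed; the host and port values are invented.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// A trimmed copy of baseCompletionRequest with only the P/D fields.
type baseCompletionRequest struct {
	Model           string `json:"model"`
	DoRemoteDecode  bool   `json:"do_remote_decode"`
	DoRemotePrefill bool   `json:"do_remote_prefill"`
	RemoteHost      string `json:"remote_host"`
	RemotePort      int    `json:"remote_port"`
}

func (b *baseCompletionRequest) doRemotePrefill() bool { return b.DoRemotePrefill }

func main() {
	// Example payload for a decode request.
	body := []byte(`{
		"model": "Qwen/Qwen2-0.5B",
		"do_remote_prefill": true,
		"remote_host": "10.0.0.7",
		"remote_port": 5557
	}`)
	var req baseCompletionRequest
	if err := json.Unmarshal(body, &req); err != nil {
		panic(err)
	}
	if req.doRemotePrefill() {
		fmt.Println("decode request: charge kv-cache-transfer-latency instead of time-to-first-token")
	}
}
```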

pkg/llm-d-inference-sim/response.go (12 additions, 0 deletions)

@@ -38,6 +38,18 @@ type baseCompletionResponse struct {
 	Usage *usage `json:"usage"`
 	// Object is the Object type, "text_completion", "chat.completion", or "chat.completion.chunk"
 	Object string `json:"object"`
+	// DoRemoteDecode boolean value, true when the request's decode will be done on a remote pod
+	DoRemoteDecode bool `json:"do_remote_decode"`
+	// DoRemotePrefill boolean value, true when the request's prefill was done on a remote pod
+	DoRemotePrefill bool `json:"do_remote_prefill"`
+	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
+	RemoteBlockIds []string `json:"remote_block_ids"`
+	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
+	RemoteEngineId string `json:"remote_engine_id"`
+	// RemoteHost is a hostname or IP address of the remote server handling prefill
+	RemoteHost string `json:"remote_host"`
+	// RemotePort is a port of the remote server handling prefill
+	RemotePort int `json:"remote_port"`
 }
 
 // usage contains token usage statistics
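
The commit message says the simulator responds "accordingly" to these fields, but the exact behavior is not shown in this diff, so the sketch below is speculative: it only illustrates how a prefill acknowledgment could hand connection details back to the decode side, using a trimmed copy of the response struct and invented values.

```go
package main

import (
	"encoding/json"
	"os"
)

// A trimmed copy of baseCompletionResponse with only the P/D fields.
type baseCompletionResponse struct {
	Object         string   `json:"object"`
	DoRemoteDecode bool     `json:"do_remote_decode"`
	RemoteBlockIds []string `json:"remote_block_ids"`
	RemoteEngineId string   `json:"remote_engine_id"`
	RemoteHost     string   `json:"remote_host"`
	RemotePort     int      `json:"remote_port"`
}

func main() {
	resp := baseCompletionResponse{
		Object:         "text_completion",
		DoRemoteDecode: true, // echo the prefill request's flag back
		RemoteBlockIds: []string{"block-0", "block-1"},
		RemoteEngineId: "sim-engine-0",
		RemoteHost:     "10.0.0.7",
		RemotePort:     5557,
	}
	_ = json.NewEncoder(os.Stdout).Encode(resp)
}
```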
