
Commit 996dae3
P/D support (#94)
* Add P/D support: respond according to the doRemotePrefill and doRemoteDecode fields
* Add a test for the kv-cache transfer time command line parameter; update config_test to use a function that creates the same configuration as defined in the config yaml file
* Update the README file; change the command line argument name to 'kv-cache-transfer-latency'
* Fixes according to the PR comments
* Add comments for fields
* Fix utils_test: initialize random before use
* Fixes in the README according to the PR review

Signed-off-by: Maya Barnea <mayab@il.ibm.com>
1 parent a0109a3 commit 996dae3

File tree: 10 files changed, +157 -72 lines changed

README.md (4 additions, 3 deletions)

@@ -29,11 +29,11 @@ The simulator supports two modes of operation:
 - `echo` mode: the response contains the same text that was received in the request. For `/v1/chat/completions` the last message for the role=`user` is used.
 - `random` mode: the response is randomly chosen from a set of pre-defined sentences.
 
-Timing of the response is defined by two parameters: `time-to-first-token` and `inter-token-latency`.
+Timing of the response is defined by the `time-to-first-token` and `inter-token-latency` parameters. In case P/D is enabled for a request, `kv-cache-transfer-latency` will be used instead of `time-to-first-token`.
 
-For a request with `stream=true`: `time-to-first-token` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream.
+For a request with `stream=true`: `time-to-first-token` or `kv-cache-transfer-latency` defines the delay before the first token is returned, and `inter-token-latency` defines the delay between subsequent tokens in the stream.
 
-For a request with `stream=false`: the response is returned after a delay of `<time-to-first-token> + (<inter-token-latency> * (<number_of_output_tokens> - 1))`
+For a request with `stream=false`: the response is returned after a delay of `<time-to-first-token> + (<inter-token-latency> * (<number_of_output_tokens> - 1))`, or `<kv-cache-transfer-latency> + (<inter-token-latency> * (<number_of_output_tokens> - 1))` in the P/D case.
 
 It can be run standalone or in a Pod for testing under packages such as Kind.

@@ -99,6 +99,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `random`: returns a sentence chosen at random from a set of pre-defined sentences
 - `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero
 - `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
+- `kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), optional, by default zero. Usually much shorter than `time-to-first-token`
 - `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used)
 
 In addition, as we are using klog, the following parameters are available:

manifests/basic-config.yaml (8 additions, 0 deletions)

@@ -0,0 +1,8 @@
+port: 8001
+model: "Qwen/Qwen2-0.5B"
+max-num-seqs: 5
+mode: "random"
+time-to-first-token: 2000
+inter-token-latency: 1000
+kv-cache-transfer-latency: 100
+seed: 100100100
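
The delay formula described in the README above can be sanity-checked against the values in this config file. Below is a minimal Go sketch, not the simulator's actual code: the `timing` struct and `responseDelay` helper are invented for illustration, and only the parameter names and the formula come from the README.

```go
package main

import (
	"fmt"
	"time"
)

// timing mirrors the three latency parameters from the README (milliseconds).
type timing struct {
	TimeToFirstToken       int
	InterTokenLatency      int
	KVCacheTransferLatency int
}

// responseDelay computes the total delay of a stream=false response.
// In the P/D case (prefill was done remotely), the KV-cache transfer
// latency replaces the time to first token.
func responseDelay(t timing, outputTokens int, remotePrefill bool) time.Duration {
	first := t.TimeToFirstToken
	if remotePrefill {
		first = t.KVCacheTransferLatency
	}
	return time.Duration(first+t.InterTokenLatency*(outputTokens-1)) * time.Millisecond
}

func main() {
	// Values from manifests/basic-config.yaml.
	t := timing{TimeToFirstToken: 2000, InterTokenLatency: 1000, KVCacheTransferLatency: 100}
	fmt.Println(responseDelay(t, 5, false)) // 6s   = 2000 + 1000*(5-1)
	fmt.Println(responseDelay(t, 5, true))  // 4.1s =  100 + 1000*(5-1)
}
```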

manifests/config.yaml (3 additions, 2 deletions)

@@ -10,6 +10,7 @@ lora-modules:
 - '{"name":"lora1","path":"/path/to/lora1"}'
 - '{"name":"lora2","path":"/path/to/lora2"}'
 mode: "random"
-time-to-first-token: 2
-inter-token-latency: 1
+time-to-first-token: 2000
+inter-token-latency: 1000
+kv-cache-transfer-latency: 100
 seed: 100100100

pkg/llm-d-inference-sim/config.go (6 additions, 0 deletions)

@@ -53,6 +53,9 @@ type configuration struct {
 	TimeToFirstToken int `yaml:"time-to-first-token"`
 	// InterTokenLatency time between generated tokens, in milliseconds
 	InterTokenLatency int `yaml:"inter-token-latency"`
+	// KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated, in milliseconds
+	KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency"`
+
 	// Mode defines the simulator response generation mode, valid values: echo, random
 	Mode string `yaml:"mode"`
 	// Seed defines random seed for operations

@@ -145,6 +148,9 @@ func (c *configuration) validate() error {
 	if c.TimeToFirstToken < 0 {
 		return errors.New("time to first token cannot be negative")
 	}
+	if c.KVCacheTransferLatency < 0 {
+		return errors.New("kv-cache transfer time cannot be negative")
+	}
 	if c.MaxLoras < 1 {
 		return errors.New("max LoRAs cannot be less than 1")
 	}
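
For reference, the `yaml` tags above are what bind the new `kv-cache-transfer-latency` key to the struct. A self-contained sketch of that round trip, assuming the gopkg.in/yaml.v3 package (the repository may use a different YAML library) and a trimmed copy of the configuration type:

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// A trimmed copy of the configuration struct, keeping only the
// latency fields touched by this commit.
type configuration struct {
	TimeToFirstToken       int `yaml:"time-to-first-token"`
	InterTokenLatency      int `yaml:"inter-token-latency"`
	KVCacheTransferLatency int `yaml:"kv-cache-transfer-latency"`
}

func main() {
	data := []byte(`
time-to-first-token: 2000
inter-token-latency: 1000
kv-cache-transfer-latency: 100
`)
	var c configuration
	if err := yaml.Unmarshal(data, &c); err != nil {
		panic(err)
	}
	// Same sanity check as configuration.validate() above.
	if c.KVCacheTransferLatency < 0 {
		panic("kv-cache transfer time cannot be negative")
	}
	fmt.Printf("%+v\n", c) // {TimeToFirstToken:2000 InterTokenLatency:1000 KVCacheTransferLatency:100}
}
```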

pkg/llm-d-inference-sim/config_test.go (40 additions, 54 deletions)

@@ -25,8 +25,7 @@ import (
 )
 
 const (
-	qwenModelName    = "Qwen/Qwen2-0.5B"
-	seedInConfigFile = 100100100
+	qwenModelName = "Qwen/Qwen2-0.5B"
 )
 
 func createSimConfig(args []string) (*configuration, error) {

@@ -46,6 +45,23 @@ func createSimConfig(args []string) (*configuration, error) {
 	return s.config, nil
 }
 
+func createDefaultConfig(model string) *configuration {
+	c := newConfig()
+
+	c.Model = model
+	c.ServedModelNames = []string{c.Model}
+	c.MaxNumSeqs = 5
+	c.MaxLoras = 2
+	c.MaxCPULoras = 5
+	c.TimeToFirstToken = 2000
+	c.InterTokenLatency = 1000
+	c.KVCacheTransferLatency = 100
+	c.Seed = 100100100
+	c.LoraModules = []loraModule{}
+
+	return c
+}
+
 type testCase struct {
 	name string
 	args []string

@@ -69,17 +85,10 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file
-	c = newConfig()
+	c = createDefaultConfig(qwenModelName)
 	c.Port = 8001
-	c.Model = qwenModelName
 	c.ServedModelNames = []string{"model1", "model2"}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
 	c.LoraModules = []loraModule{{Name: "lora1", Path: "/path/to/lora1"}, {Name: "lora2", Path: "/path/to/lora2"}}
-	c.Seed = seedInConfigFile
 	test = testCase{
 		name: "config file",
 		args: []string{"cmd", "--config", "../../manifests/config.yaml"},

@@ -92,15 +101,9 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args
-	c = newConfig()
+	c = createDefaultConfig(model)
 	c.Port = 8002
-	c.Model = model
 	c.ServedModelNames = []string{"alias1", "alias2"}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
 	c.Seed = 100
 	c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}, {Name: "lora4", Path: "/path/to/lora4"}}
 	c.LoraModulesString = []string{

@@ -118,16 +121,8 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args with different format
-	c = newConfig()
+	c = createDefaultConfig(model)
 	c.Port = 8002
-	c.Model = model
-	c.ServedModelNames = []string{c.Model}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
-	c.Seed = seedInConfigFile
 	c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}}
 	c.LoraModulesString = []string{
 		"{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}",

@@ -143,16 +138,8 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args with empty string
-	c = newConfig()
+	c = createDefaultConfig(model)
 	c.Port = 8002
-	c.Model = model
-	c.ServedModelNames = []string{c.Model}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
-	c.Seed = seedInConfigFile
 	c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}}
 	c.LoraModulesString = []string{
 		"{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}",

@@ -168,18 +155,10 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args with empty string for loras
-	c = newConfig()
+	c = createDefaultConfig(qwenModelName)
 	c.Port = 8001
-	c.Model = qwenModelName
 	c.ServedModelNames = []string{"model1", "model2"}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
-	c.LoraModules = []loraModule{}
 	c.LoraModulesString = []string{}
-	c.Seed = seedInConfigFile
 	test = testCase{
 		name: "config file with command line args with empty string for loras",
 		args: []string{"cmd", "--config", "../../manifests/config.yaml", "--lora-modules", ""},

@@ -188,25 +167,31 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args with empty parameter for loras
-	c = newConfig()
+	c = createDefaultConfig(qwenModelName)
 	c.Port = 8001
-	c.Model = qwenModelName
 	c.ServedModelNames = []string{"model1", "model2"}
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-	c.MaxNumSeqs = 5
-	c.TimeToFirstToken = 2
-	c.InterTokenLatency = 1
-	c.LoraModules = []loraModule{}
 	c.LoraModulesString = []string{}
-	c.Seed = seedInConfigFile
 	test = testCase{
 		name: "config file with command line args with empty parameter for loras",
 		args: []string{"cmd", "--config", "../../manifests/config.yaml", "--lora-modules"},
 		expectedConfig: c,
 	}
 	tests = append(tests, test)
 
+	// Config from basic-config.yaml file plus a command line arg for the kv-cache transfer time
+	c = createDefaultConfig(qwenModelName)
+	c.Port = 8001
+	// the basic config file does not contain lora-related properties
+	c.MaxLoras = 1
+	c.MaxCPULoras = 1
+	c.KVCacheTransferLatency = 50
+	test = testCase{
+		name: "config file with command line args with time to transfer kv-cache",
+		args: []string{"cmd", "--config", "../../manifests/basic-config.yaml", "--kv-cache-transfer-latency", "50"},
+		expectedConfig: c,
+	}
+	tests = append(tests, test)
+
 	// Invalid configurations
 	test = testCase{
 		name: "invalid model",

@@ -258,18 +243,19 @@ var _ = Describe("Simulator configuration", func() {
 	Entry(tests[4].name, tests[4].args, tests[4].expectedConfig),
 	Entry(tests[5].name, tests[5].args, tests[5].expectedConfig),
 	Entry(tests[6].name, tests[6].args, tests[6].expectedConfig),
+	Entry(tests[7].name, tests[7].args, tests[7].expectedConfig),
 )
 
 DescribeTable("invalid configurations",
 	func(args []string) {
 		_, err := createSimConfig(args)
 		Expect(err).To(HaveOccurred())
 	},
-	Entry(tests[7].name, tests[7].args),
 	Entry(tests[8].name, tests[8].args),
 	Entry(tests[9].name, tests[9].args),
 	Entry(tests[10].name, tests[10].args),
 	Entry(tests[11].name, tests[11].args),
 	Entry(tests[12].name, tests[12].args),
+	Entry(tests[13].name, tests[13].args),
 )
})

pkg/llm-d-inference-sim/request.go (24 additions, 0 deletions)

@@ -44,6 +44,10 @@ type completionRequest interface {
 	getToolChoice() string
 	// getMaxCompletionTokens returns the maximum completion tokens requested
 	getMaxCompletionTokens() *int64
+	// doRemoteDecode() returns true if the do_remote_decode field is true in the request, meaning this is a prefill request
+	doRemoteDecode() bool
+	// doRemotePrefill() returns true if the do_remote_prefill field is true in the request, meaning this is a decode request
+	doRemotePrefill() bool
 }
 
 // baseCompletionRequest contains base completion request related information

@@ -54,6 +58,18 @@ type baseCompletionRequest struct {
 	StreamOptions streamOptions `json:"stream_options"`
 	// Model defines Model name to use for "inference", could be base Model name or one of available LoRA adapters
 	Model string `json:"model"`
+	// DoRemoteDecode boolean value, true when the request's decode will be done on a remote pod
+	DoRemoteDecode bool `json:"do_remote_decode"`
+	// DoRemotePrefill boolean value, true when the request's prefill was done on a remote pod
+	DoRemotePrefill bool `json:"do_remote_prefill"`
+	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
+	RemoteBlockIds []string `json:"remote_block_ids"`
+	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
+	RemoteEngineId string `json:"remote_engine_id"`
+	// RemoteHost is a hostname or IP address of the remote server handling prefill
+	RemoteHost string `json:"remote_host"`
+	// RemotePort is a port of the remote server handling prefill
+	RemotePort int `json:"remote_port"`
 }
 
 // StreamOptions defines streaming options for streaming requests

@@ -74,6 +90,14 @@
 	return !b.Stream || b.StreamOptions.IncludeUsage
 }
 
+func (b *baseCompletionRequest) doRemoteDecode() bool {
+	return b.DoRemoteDecode
+}
+
+func (b *baseCompletionRequest) doRemotePrefill() bool {
+	return b.DoRemotePrefill
+}
+
 // completionReqCtx is a context passed in the simulator's flow, it contains the request data needed
 // to generate the simulator's response
 type completionReqCtx struct {
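
The `json` tags above define the wire format of the new fields. Below is a hypothetical round trip using a trimmed copy of the request struct rather than the simulator's own type, showing how a decode request (one whose prefill already ran elsewhere) would be parsed; the host and port values are invented.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// A trimmed copy of baseCompletionRequest with only the P/D fields.
type baseCompletionRequest struct {
	Model           string `json:"model"`
	DoRemoteDecode  bool   `json:"do_remote_decode"`
	DoRemotePrefill bool   `json:"do_remote_prefill"`
	RemoteHost      string `json:"remote_host"`
	RemotePort      int    `json:"remote_port"`
}

func (b *baseCompletionRequest) doRemotePrefill() bool { return b.DoRemotePrefill }

func main() {
	// Example payload for a decode request.
	body := []byte(`{
		"model": "Qwen/Qwen2-0.5B",
		"do_remote_prefill": true,
		"remote_host": "10.0.0.7",
		"remote_port": 5557
	}`)
	var req baseCompletionRequest
	if err := json.Unmarshal(body, &req); err != nil {
		panic(err)
	}
	if req.doRemotePrefill() {
		fmt.Println("decode request: charge kv-cache-transfer-latency instead of time-to-first-token")
	}
}
```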

pkg/llm-d-inference-sim/response.go (12 additions, 0 deletions)

@@ -38,6 +38,18 @@ type baseCompletionResponse struct {
 	Usage *usage `json:"usage"`
 	// Object is the Object type, "text_completion", "chat.completion", or "chat.completion.chunk"
 	Object string `json:"object"`
+	// DoRemoteDecode boolean value, true when the request's decode will be done on a remote pod
+	DoRemoteDecode bool `json:"do_remote_decode"`
+	// DoRemotePrefill boolean value, true when the request's prefill was done on a remote pod
+	DoRemotePrefill bool `json:"do_remote_prefill"`
+	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
+	RemoteBlockIds []string `json:"remote_block_ids"`
+	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
+	RemoteEngineId string `json:"remote_engine_id"`
+	// RemoteHost is a hostname or IP address of the remote server handling prefill
+	RemoteHost string `json:"remote_host"`
+	// RemotePort is a port of the remote server handling prefill
+	RemotePort int `json:"remote_port"`
 }
 
 // usage contains token usage statistics
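
The commit message says the simulator responds "accordingly" to these fields, but the exact behavior is not shown in this diff, so the sketch below is speculative: it only illustrates how a prefill acknowledgment could hand connection details back to the decode side, using a trimmed copy of the response struct and invented values.

```go
package main

import (
	"encoding/json"
	"os"
)

// A trimmed copy of baseCompletionResponse with only the P/D fields.
type baseCompletionResponse struct {
	Object         string   `json:"object"`
	DoRemoteDecode bool     `json:"do_remote_decode"`
	RemoteBlockIds []string `json:"remote_block_ids"`
	RemoteEngineId string   `json:"remote_engine_id"`
	RemoteHost     string   `json:"remote_host"`
	RemotePort     int      `json:"remote_port"`
}

func main() {
	resp := baseCompletionResponse{
		Object:         "text_completion",
		DoRemoteDecode: true, // echo the prefill request's flag back
		RemoteBlockIds: []string{"block-0", "block-1"},
		RemoteEngineId: "sim-engine-0",
		RemoteHost:     "10.0.0.7",
		RemotePort:     5557,
	}
	_ = json.NewEncoder(os.Stdout).Encode(resp)
}
```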
