-
Notifications
You must be signed in to change notification settings - Fork 16
Pd support #94
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Pd support #94
Changes from 6 commits
10d5088
276f15a
626c432
98b9585
651c3b0
d061f72
3c699b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,9 +29,9 @@ The simulator supports two modes of operation: | |
- `echo` mode: the response contains the same text that was received in the request. For `/v1/chat/completions` the last message for the role=`user` is used. | ||
- `random` mode: the response is randomly chosen from a set of pre-defined sentences. | ||
|
||
Timing of the response is defined by two parameters: `time-to-first-token` and `inter-token-latency`. | ||
Timing of the response is defined by `time-to-first-token` and `inter-token-latency` parameters. In case P/D is enabled for a request, `kv-cache-transfer-latency` will be used instead of `time-to-first-token`. | ||
|
||
For a request with `stream=true`: `time-to-first-token` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream. | ||
For a request with `stream=true`: `time-to-first-token` or `kv-cache-transfer-latency` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream. | ||
|
||
For a request with `stream=false`: the response is returned after delay of `<time-to-first-token> + (<inter-token-latency> * (<number_of_output_tokens> - 1))` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why wasn't kv-cache-transfer-latency mentioned here? |
||
|
||
|
@@ -99,6 +99,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started | |
- `random`: returns a sentence chosen at random from a set of pre-defined sentences | ||
- `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero | ||
- `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero | ||
- `kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), by default zero. Usually much shorter than `time-to-first-token` | ||
- `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used) | ||
|
||
In addition, as we are using klog, the following parameters are available: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
port: 8001 | ||
model: "Qwen/Qwen2-0.5B" | ||
max-num-seqs: 5 | ||
mode: "random" | ||
time-to-first-token: 2000 | ||
inter-token-latency: 1000 | ||
kv-cache-transfer-latency: 100 | ||
seed: 100100100 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,8 +25,7 @@ import ( | |
) | ||
|
||
const ( | ||
qwenModelName = "Qwen/Qwen2-0.5B" | ||
seedInConfigFile = 100100100 | ||
qwenModelName = "Qwen/Qwen2-0.5B" | ||
) | ||
|
||
func createSimConfig(args []string) (*configuration, error) { | ||
|
@@ -46,6 +45,23 @@ func createSimConfig(args []string) (*configuration, error) { | |
return s.config, nil | ||
} | ||
|
||
func createDefaultConfig(model string) *configuration { | ||
c := newConfig() | ||
|
||
c.Model = model | ||
c.ServedModelNames = []string{c.Model} | ||
c.MaxNumSeqs = 5 | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.TimeToFirstToken = 2000 | ||
c.InterTokenLatency = 1000 | ||
c.KVCacheTransferLatency = 100 | ||
c.Seed = 100100100 | ||
c.LoraModules = []loraModule{} | ||
|
||
return c | ||
} | ||
|
||
type testCase struct { | ||
name string | ||
args []string | ||
|
@@ -69,17 +85,10 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file | ||
c = newConfig() | ||
c = createDefaultConfig(qwenModelName) | ||
c.Port = 8001 | ||
c.Model = qwenModelName | ||
c.ServedModelNames = []string{"model1", "model2"} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.LoraModules = []loraModule{{Name: "lora1", Path: "/path/to/lora1"}, {Name: "lora2", Path: "/path/to/lora2"}} | ||
c.Seed = seedInConfigFile | ||
test = testCase{ | ||
name: "config file", | ||
args: []string{"cmd", "--config", "../../manifests/config.yaml"}, | ||
|
@@ -92,15 +101,9 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args | ||
c = newConfig() | ||
c = createDefaultConfig(model) | ||
c.Port = 8002 | ||
c.Model = model | ||
c.ServedModelNames = []string{"alias1", "alias2"} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.Seed = 100 | ||
c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}, {Name: "lora4", Path: "/path/to/lora4"}} | ||
c.LoraModulesString = []string{ | ||
|
@@ -118,16 +121,8 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args with different format | ||
c = newConfig() | ||
c = createDefaultConfig(model) | ||
c.Port = 8002 | ||
c.Model = model | ||
c.ServedModelNames = []string{c.Model} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.Seed = seedInConfigFile | ||
c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}} | ||
c.LoraModulesString = []string{ | ||
"{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}", | ||
|
@@ -143,16 +138,8 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args with empty string | ||
c = newConfig() | ||
c = createDefaultConfig(model) | ||
c.Port = 8002 | ||
c.Model = model | ||
c.ServedModelNames = []string{c.Model} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.Seed = seedInConfigFile | ||
c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}} | ||
c.LoraModulesString = []string{ | ||
"{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}", | ||
|
@@ -168,18 +155,10 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args with empty string for loras | ||
c = newConfig() | ||
c = createDefaultConfig(qwenModelName) | ||
c.Port = 8001 | ||
c.Model = qwenModelName | ||
c.ServedModelNames = []string{"model1", "model2"} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.LoraModules = []loraModule{} | ||
c.LoraModulesString = []string{} | ||
c.Seed = seedInConfigFile | ||
test = testCase{ | ||
name: "config file with command line args with empty string for loras", | ||
args: []string{"cmd", "--config", "../../manifests/config.yaml", "--lora-modules", ""}, | ||
|
@@ -188,25 +167,31 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args with empty parameter for loras | ||
c = newConfig() | ||
c = createDefaultConfig(qwenModelName) | ||
c.Port = 8001 | ||
c.Model = qwenModelName | ||
c.ServedModelNames = []string{"model1", "model2"} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.LoraModules = []loraModule{} | ||
c.LoraModulesString = []string{} | ||
c.Seed = seedInConfigFile | ||
test = testCase{ | ||
name: "config file with command line args with empty parameter for loras", | ||
args: []string{"cmd", "--config", "../../manifests/config.yaml", "--lora-modules"}, | ||
expectedConfig: c, | ||
} | ||
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args with time to copy cache | ||
c = createDefaultConfig(qwenModelName) | ||
c.Port = 8001 | ||
// basic config file does not contain properties related to lora | ||
c.MaxLoras = 1 | ||
c.MaxCPULoras = 1 | ||
c.KVCacheTransferLatency = 50 | ||
test = testCase{ | ||
name: "config file with command line args with time to transfer kv-cache", | ||
args: []string{"cmd", "--config", "../../manifests/basic-config.yaml", "--kv-cache-transfer-latency", "50"}, | ||
expectedConfig: c, | ||
} | ||
tests = append(tests, test) | ||
|
||
// Invalid configurations | ||
test = testCase{ | ||
name: "invalid model", | ||
|
@@ -258,18 +243,19 @@ var _ = Describe("Simulator configuration", func() { | |
Entry(tests[4].name, tests[4].args, tests[4].expectedConfig), | ||
Entry(tests[5].name, tests[5].args, tests[5].expectedConfig), | ||
Entry(tests[6].name, tests[6].args, tests[6].expectedConfig), | ||
Entry(tests[7].name, tests[7].args, tests[7].expectedConfig), | ||
) | ||
|
||
DescribeTable("invalid configurations", | ||
func(args []string) { | ||
_, err := createSimConfig(args) | ||
Expect(err).To(HaveOccurred()) | ||
}, | ||
Entry(tests[7].name, tests[7].args), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You removed a test here instead of only increasing the indices There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added test 13 |
||
Entry(tests[8].name, tests[8].args), | ||
Entry(tests[9].name, tests[9].args), | ||
Entry(tests[10].name, tests[10].args), | ||
Entry(tests[11].name, tests[11].args), | ||
Entry(tests[12].name, tests[12].args), | ||
Entry(tests[13].name, tests[13].args), | ||
) | ||
}) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,6 +44,10 @@ type completionRequest interface { | |
getToolChoice() string | ||
// getMaxCompletionTokens returns the maximum completion tokens requested | ||
getMaxCompletionTokens() *int64 | ||
// doRemoteDecode() returns true if do_remote_decode field is true in the request, this means that this is a prefill request | ||
doRemoteDecode() bool | ||
// doRemotePrefill() returns true if do_remote_prefill field is true in the request, this means that this is a decode request | ||
doRemotePrefill() bool | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The names in the comments don't match the actual names There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
} | ||
|
||
// baseCompletionRequest contains base completion request related information | ||
|
@@ -54,6 +58,18 @@ type baseCompletionRequest struct { | |
StreamOptions streamOptions `json:"stream_options"` | ||
// Model defines Model name to use for "inference", could be base Model name or one of available LoRA adapters | ||
Model string `json:"model"` | ||
// DoRemoteDecode boolean value, true when request's decode will be done on remote pod | ||
DoRemoteDecode bool `json:"do_remote_decode"` | ||
// DoRemotePrefill boolean value, true when request's prefill was done on remote pod | ||
DoRemotePrefill bool `json:"do_remote_prefill"` | ||
// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding | ||
RemoteBlockIds []string `json:"remote_block_ids"` | ||
// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests | ||
RemoteEngineId string `json:"remote_engine_id"` | ||
// RemoteHost is a hostname or IP address of the remote server handling prefill | ||
RemoteHost string `json:"remote_host"` | ||
// RemotePort is a port of the remote server handling prefill | ||
RemotePort int `json:"remote_port"` | ||
} | ||
|
||
// StreamOptions defines streaming options for streaming requests | ||
|
@@ -74,6 +90,14 @@ func (b *baseCompletionRequest) includeUsage() bool { | |
return !b.Stream || b.StreamOptions.IncludeUsage | ||
} | ||
|
||
func (b *baseCompletionRequest) doRemoteDecode() bool { | ||
return b.DoRemoteDecode | ||
} | ||
|
||
func (b *baseCompletionRequest) doRemotePrefill() bool { | ||
return b.DoRemotePrefill | ||
} | ||
|
||
// completionReqCtx is a context passed in the simulator's flow, it contains the request data needed | ||
// to generate the simulator's response | ||
type completionReqCtx struct { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
replace: is defined by `time-to-first-token
with: is defined by the
time-to-first-token
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done