-
Notifications
You must be signed in to change notification settings - Fork 16
Pd support #94
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Pd support #94
Changes from 3 commits
10d5088
276f15a
626c432
98b9585
651c3b0
d061f72
3c699b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,9 +29,9 @@ The simulator supports two modes of operation: | |
- `echo` mode: the response contains the same text that was received in the request. For `/v1/chat/completions` the last message for the role=`user` is used. | ||
- `random` mode: the response is randomly chosen from a set of pre-defined sentences. | ||
|
||
Timing of the response is defined by two parameters: `time-to-first-token` and `inter-token-latency`. | ||
Timing of the response is defined by `time-to-first-token` and `inter-token-latency` parameters. In case P/D is enabled for a request, `kv-cache-transfer-latency` will be used instead of `time-to-first-token`. | ||
|
||
For a request with `stream=true`: `time-to-first-token` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream. | ||
For a request with `stream=true`: `time-to-first-token` or `kv-cache-transfer-latency` defines the delay before the first token is returned, `inter-token-latency` defines the delay between subsequent tokens in the stream. | ||
|
||
For a request with `stream=false`: the response is returned after delay of `<time-to-first-token> + (<inter-token-latency> * (<number_of_output_tokens> - 1))` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why wasn't kv-cache-transfer-latency mentioned here? |
||
|
||
|
@@ -99,6 +99,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started | |
- `random`: returns a sentence chosen at random from a set of pre-defined sentences | ||
- `time-to-first-token`: the time to the first token (in milliseconds), optional, by default zero | ||
- `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero | ||
- `kv-cache-transfer-latency`: time for KV-cache transfer from a remote vLLM (in milliseconds), by default zero. Usually much shorter than `time-to-first-token` | ||
- `seed`: random seed for operations (if not set, current Unix time in nanoseconds is used) | ||
|
||
In addition, as we are using klog, the following parameters are available: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
port: 8001 | ||
model: "Qwen/Qwen2-0.5B" | ||
max-num-seqs: 5 | ||
mode: "random" | ||
time-to-first-token: 2000 | ||
inter-token-latency: 1000 | ||
kv-cache-transfer-latency: 100 | ||
seed: 100100100 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,8 +25,7 @@ import ( | |
) | ||
|
||
const ( | ||
qwenModelName = "Qwen/Qwen2-0.5B" | ||
seedInConfigFile = 100100100 | ||
qwenModelName = "Qwen/Qwen2-0.5B" | ||
) | ||
|
||
func createSimConfig(args []string) (*configuration, error) { | ||
|
@@ -46,6 +45,33 @@ func createSimConfig(args []string) (*configuration, error) { | |
return s.config, nil | ||
} | ||
|
||
func createDefaultBasicConfig(model string) *configuration { | ||
c := newConfig() | ||
|
||
c.Model = model | ||
c.ServedModelNames = []string{c.Model} | ||
c.MaxNumSeqs = 5 | ||
c.MaxLoras = 1 | ||
c.MaxCPULoras = 1 | ||
c.TimeToFirstToken = 2000 | ||
c.InterTokenLatency = 1000 | ||
c.KVCacheTransferLatency = 100 | ||
c.Seed = 100100100 | ||
c.LoraModules = []loraModule{} | ||
|
||
return c | ||
} | ||
|
||
func createDefaultConfig(model string) *configuration { | ||
c := createDefaultBasicConfig(model) | ||
|
||
// parameters special to config.yaml | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
|
||
return c | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is confusing, and one function is enough. There is only one case for createDefaultBasicConfig, and we can just update the lora parameters in the test. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. basicConfig function was removed |
||
type testCase struct { | ||
name string | ||
args []string | ||
|
@@ -69,17 +95,10 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file | ||
c = newConfig() | ||
c = createDefaultConfig(qwenModelName) | ||
c.Port = 8001 | ||
c.Model = qwenModelName | ||
c.ServedModelNames = []string{"model1", "model2"} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.LoraModules = []loraModule{{Name: "lora1", Path: "/path/to/lora1"}, {Name: "lora2", Path: "/path/to/lora2"}} | ||
c.Seed = seedInConfigFile | ||
test = testCase{ | ||
name: "config file", | ||
args: []string{"cmd", "--config", "../../manifests/config.yaml"}, | ||
|
@@ -92,15 +111,9 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args | ||
c = newConfig() | ||
c = createDefaultConfig(model) | ||
c.Port = 8002 | ||
c.Model = model | ||
c.ServedModelNames = []string{"alias1", "alias2"} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.Seed = 100 | ||
c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}, {Name: "lora4", Path: "/path/to/lora4"}} | ||
c.LoraModulesString = []string{ | ||
|
@@ -118,16 +131,8 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args with different format | ||
c = newConfig() | ||
c = createDefaultConfig(model) | ||
c.Port = 8002 | ||
c.Model = model | ||
c.ServedModelNames = []string{c.Model} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.Seed = seedInConfigFile | ||
c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}} | ||
c.LoraModulesString = []string{ | ||
"{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}", | ||
|
@@ -143,16 +148,8 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args with empty string | ||
c = newConfig() | ||
c = createDefaultConfig(model) | ||
c.Port = 8002 | ||
c.Model = model | ||
c.ServedModelNames = []string{c.Model} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.Seed = seedInConfigFile | ||
c.LoraModules = []loraModule{{Name: "lora3", Path: "/path/to/lora3"}} | ||
c.LoraModulesString = []string{ | ||
"{\"name\":\"lora3\",\"path\":\"/path/to/lora3\"}", | ||
|
@@ -168,18 +165,10 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args with empty string for loras | ||
c = newConfig() | ||
c = createDefaultConfig(qwenModelName) | ||
c.Port = 8001 | ||
c.Model = qwenModelName | ||
c.ServedModelNames = []string{"model1", "model2"} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.LoraModules = []loraModule{} | ||
c.LoraModulesString = []string{} | ||
c.Seed = seedInConfigFile | ||
test = testCase{ | ||
name: "config file with command line args with empty string for loras", | ||
args: []string{"cmd", "--config", "../../manifests/config.yaml", "--lora-modules", ""}, | ||
|
@@ -188,25 +177,28 @@ var _ = Describe("Simulator configuration", func() { | |
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args with empty parameter for loras | ||
c = newConfig() | ||
c = createDefaultConfig(qwenModelName) | ||
c.Port = 8001 | ||
c.Model = qwenModelName | ||
c.ServedModelNames = []string{"model1", "model2"} | ||
c.MaxLoras = 2 | ||
c.MaxCPULoras = 5 | ||
c.MaxNumSeqs = 5 | ||
c.TimeToFirstToken = 2 | ||
c.InterTokenLatency = 1 | ||
c.LoraModules = []loraModule{} | ||
c.LoraModulesString = []string{} | ||
c.Seed = seedInConfigFile | ||
test = testCase{ | ||
name: "config file with command line args with empty parameter for loras", | ||
args: []string{"cmd", "--config", "../../manifests/config.yaml", "--lora-modules"}, | ||
expectedConfig: c, | ||
} | ||
tests = append(tests, test) | ||
|
||
// Config from config.yaml file plus command line args with time to copy cache | ||
c = createDefaultBasicConfig(qwenModelName) | ||
c.Port = 8001 | ||
c.KVCacheTransferLatency = 50 | ||
test = testCase{ | ||
name: "config file with command line args with time to transfer kv-cache", | ||
args: []string{"cmd", "--config", "../../manifests/basic-config.yaml", "--kv-cache-transfer-latency", "50"}, | ||
expectedConfig: c, | ||
} | ||
tests = append(tests, test) | ||
|
||
// Invalid configurations | ||
test = testCase{ | ||
name: "invalid model", | ||
|
@@ -258,14 +250,14 @@ var _ = Describe("Simulator configuration", func() { | |
Entry(tests[4].name, tests[4].args, tests[4].expectedConfig), | ||
Entry(tests[5].name, tests[5].args, tests[5].expectedConfig), | ||
Entry(tests[6].name, tests[6].args, tests[6].expectedConfig), | ||
Entry(tests[7].name, tests[7].args, tests[7].expectedConfig), | ||
) | ||
|
||
DescribeTable("invalid configurations", | ||
func(args []string) { | ||
_, err := createSimConfig(args) | ||
Expect(err).To(HaveOccurred()) | ||
}, | ||
Entry(tests[7].name, tests[7].args), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You removed a test here instead of only increasing the indices There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added test 13 |
||
Entry(tests[8].name, tests[8].args), | ||
Entry(tests[9].name, tests[9].args), | ||
Entry(tests[10].name, tests[10].args), | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,6 +44,10 @@ type completionRequest interface { | |
getToolChoice() string | ||
// getMaxCompletionTokens returns the maximum completion tokens requested | ||
getMaxCompletionTokens() *int64 | ||
// doRemoteDecode() returns true if do_remote_decode is true in the request, this means that this is a prefill request | ||
doRemoteDecode() bool | ||
// doRemotePrefill() returns true if do_remote_prefill is true in the request, this means that this is a decode request | ||
doRemotePrefill() bool | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The names in the comments don't match the actual names There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
} | ||
|
||
// baseCompletionRequest contains base completion request related information | ||
|
@@ -53,7 +57,13 @@ type baseCompletionRequest struct { | |
// StreamOptions defines streaming options in case Stream is set to true | ||
StreamOptions streamOptions `json:"stream_options"` | ||
// Model defines Model name to use for "inference", could be base Model name or one of available LoRA adapters | ||
Model string `json:"model"` | ||
Model string `json:"model"` | ||
DoRemoteDecode bool `json:"do_remote_decode"` | ||
DoRemotePrefill bool `json:"do_remote_prefill"` | ||
RemoteBlockIds []string `json:"remote_block_ids"` | ||
RemoteEngineId string `json:"remote_engine_id"` | ||
RemoteHost string `json:"remote_host"` | ||
RemotePort int `json:"remote_port"` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider adding comments for the fields There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
} | ||
|
||
// StreamOptions defines streaming options for streaming requests | ||
|
@@ -74,6 +84,14 @@ func (b *baseCompletionRequest) includeUsage() bool { | |
return !b.Stream || b.StreamOptions.IncludeUsage | ||
} | ||
|
||
func (b *baseCompletionRequest) doRemoteDecode() bool { | ||
return b.DoRemoteDecode | ||
} | ||
|
||
func (b *baseCompletionRequest) doRemotePrefill() bool { | ||
return b.DoRemotePrefill | ||
} | ||
|
||
// completionReqCtx is a context passed in the simulator's flow, it contains the request data needed | ||
// to generate the simulator's response | ||
type completionReqCtx struct { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,7 +37,13 @@ type baseCompletionResponse struct { | |
// Usage contains the token usage statistics for the request | ||
Usage *usage `json:"usage"` | ||
// Object is the Object type, "text_completion", "chat.completion", or "chat.completion.chunk" | ||
Object string `json:"object"` | ||
Object string `json:"object"` | ||
DoRemoteDecode bool `json:"do_remote_decode"` | ||
DoRemotePrefill bool `json:"do_remote_prefill"` | ||
RemoteBlockIds []string `json:"remote_block_ids"` | ||
RemoteEngineId string `json:"remote_engine_id"` | ||
RemoteHost string `json:"remote_host"` | ||
RemotePort int `json:"remote_port"` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Likewise There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
} | ||
|
||
// usage contains token usage statistics | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
replace: is defined by `time-to-first-token
with: is defined by the
time-to-first-token
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done