
Commit 98b9585

committed
fixes according to the PR's comments
Signed-off-by: Maya Barnea <mayab@il.ibm.com>
1 parent 626c432 commit 98b9585

File tree

5 files changed: +44 -39 lines changed


pkg/llm-d-inference-sim/config_test.go

Lines changed: 8 additions & 14 deletions
@@ -45,14 +45,14 @@ func createSimConfig(args []string) (*configuration, error) {
 	return s.config, nil
 }
 
-func createDefaultBasicConfig(model string) *configuration {
+func createDefaultConfig(model string) *configuration {
 	c := newConfig()
 
 	c.Model = model
 	c.ServedModelNames = []string{c.Model}
 	c.MaxNumSeqs = 5
-	c.MaxLoras = 1
-	c.MaxCPULoras = 1
+	c.MaxLoras = 2
+	c.MaxCPULoras = 5
 	c.TimeToFirstToken = 2000
 	c.InterTokenLatency = 1000
 	c.KVCacheTransferLatency = 100

@@ -62,16 +62,6 @@ func createDefaultBasicConfig(model string) *configuration {
 	return c
 }
 
-func createDefaultConfig(model string) *configuration {
-	c := createDefaultBasicConfig(model)
-
-	// parameters special to config.yaml
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-
-	return c
-}
-
 type testCase struct {
 	name string
 	args []string

@@ -189,8 +179,11 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args with time to copy cache
-	c = createDefaultBasicConfig(qwenModelName)
+	c = createDefaultConfig(qwenModelName)
 	c.Port = 8001
+	// basic config file does not contain properties related to lora
+	c.MaxLoras = 1
+	c.MaxCPULoras = 1
 	c.KVCacheTransferLatency = 50
 	test = testCase{
 		name: "config file with command line args with time to transfer kv-cache",

@@ -263,5 +256,6 @@ var _ = Describe("Simulator configuration", func() {
 		Entry(tests[10].name, tests[10].args),
 		Entry(tests[11].name, tests[11].args),
 		Entry(tests[12].name, tests[12].args),
+		Entry(tests[13].name, tests[13].args),
 	)
 })
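Read together, the two hunks fold the old two-helper setup into one: createDefaultConfig now carries the config.yaml values directly, and the one test built on the basic config file overrides the LoRA fields back to the simulator defaults. Assembled from the hunks above (newConfig() and the configuration type live elsewhere in the package, and fields the diff elides are marked), the post-commit helper reads roughly:

func createDefaultConfig(model string) *configuration {
	c := newConfig()

	c.Model = model
	c.ServedModelNames = []string{c.Model}
	c.MaxNumSeqs = 5
	// values matching those set in config.yaml
	c.MaxLoras = 2
	c.MaxCPULoras = 5
	c.TimeToFirstToken = 2000
	c.InterTokenLatency = 1000
	c.KVCacheTransferLatency = 100
	// ... (fields between here and the return are not shown in the diff)

	return c
}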

pkg/llm-d-inference-sim/request.go

Lines changed: 15 additions & 9 deletions
@@ -44,9 +44,9 @@ type completionRequest interface {
 	getToolChoice() string
 	// getMaxCompletionTokens returns the maximum completion tokens requested
 	getMaxCompletionTokens() *int64
-	// isDoRemoteDecode() returns true is do_remote_decode is true in the request, this means that this is prefill request
+	// doRemoteDecode() returns true if the do_remote_decode field is true in the request, which means this is a prefill request
 	doRemoteDecode() bool
-	// isDoRemotePrefill() returns true is do_remote_prefill is true in the request, this means that this is decode request
+	// doRemotePrefill() returns true if the do_remote_prefill field is true in the request, which means this is a decode request
 	doRemotePrefill() bool
 }

@@ -57,13 +57,19 @@ type baseCompletionRequest struct {
 	// StreamOptions defines streaming options in case Stream is set to true
 	StreamOptions streamOptions `json:"stream_options"`
 	// Model defines Model name to use for "inference", could be base Model name or one of available LoRA adapters
-	Model           string   `json:"model"`
-	DoRemoteDecode  bool     `json:"do_remote_decode"`
-	DoRemotePrefill bool     `json:"do_remote_prefill"`
-	RemoteBlockIds  []string `json:"remote_block_ids"`
-	RemoteEngineId  string   `json:"remote_engine_id"`
-	RemoteHost      string   `json:"remote_host"`
-	RemotePort      int      `json:"remote_port"`
+	Model string `json:"model"`
+	// DoRemoteDecode is true when the request's decode will be done on a remote pod
+	DoRemoteDecode bool `json:"do_remote_decode"`
+	// DoRemotePrefill is true when the request's prefill was done on a remote pod
+	DoRemotePrefill bool `json:"do_remote_prefill"`
+	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
+	RemoteBlockIds []string `json:"remote_block_ids"`
+	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
+	RemoteEngineId string `json:"remote_engine_id"`
+	// RemoteHost is a hostname or IP address of the remote server handling prefill
+	RemoteHost string `json:"remote_host"`
+	// RemotePort is a port of the remote server handling prefill
+	RemotePort int `json:"remote_port"`
 }
 
 // StreamOptions defines streaming options for streaming requests
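These fields wire the simulator into a disaggregated prefill/decode flow. As a minimal sketch of what a decode-side request might look like on the wire, using a trimmed copy of the struct above (the model name and all field values are hypothetical):

package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed copy of baseCompletionRequest, for illustration only; the real
// struct lives in pkg/llm-d-inference-sim/request.go.
type remoteRequest struct {
	Model           string   `json:"model"`
	DoRemotePrefill bool     `json:"do_remote_prefill"`
	RemoteBlockIds  []string `json:"remote_block_ids"`
	RemoteEngineId  string   `json:"remote_engine_id"`
	RemoteHost      string   `json:"remote_host"`
	RemotePort      int      `json:"remote_port"`
}

func main() {
	// Hypothetical decode request: prefill already ran on another pod,
	// so the body points at the engine holding the KV-cache blocks.
	body := []byte(`{
		"model": "qwen",
		"do_remote_prefill": true,
		"remote_block_ids": ["blk-1", "blk-2"],
		"remote_engine_id": "engine-0",
		"remote_host": "10.0.0.7",
		"remote_port": 8000
	}`)

	var req remoteRequest
	if err := json.Unmarshal(body, &req); err != nil {
		panic(err)
	}
	fmt.Printf("decode request, prefill done remotely: %v\n", req.DoRemotePrefill)
}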

pkg/llm-d-inference-sim/response.go

Lines changed: 11 additions & 7 deletions
@@ -37,13 +37,17 @@ type baseCompletionResponse struct {
 	// Usage contains the token usage statistics for the request
 	Usage *usage `json:"usage"`
 	// Object is the Object type, "text_completion", "chat.completion", or "chat.completion.chunk"
-	Object          string   `json:"object"`
-	DoRemoteDecode  bool     `json:"do_remote_decode"`
-	DoRemotePrefill bool     `json:"do_remote_prefill"`
-	RemoteBlockIds  []string `json:"remote_block_ids"`
-	RemoteEngineId  string   `json:"remote_engine_id"`
-	RemoteHost      string   `json:"remote_host"`
-	RemotePort      int      `json:"remote_port"`
+	Object          string `json:"object"`
+	DoRemoteDecode  bool   `json:"do_remote_decode"`
+	DoRemotePrefill bool   `json:"do_remote_prefill"`
+	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
+	RemoteBlockIds []string `json:"remote_block_ids"`
+	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
+	RemoteEngineId string `json:"remote_engine_id"`
+	// RemoteHost is a hostname or IP address of the remote server handling prefill
+	RemoteHost string `json:"remote_host"`
+	// RemotePort is a port of the remote server handling prefill
+	RemotePort int `json:"remote_port"`
 }
 
 // usage contains token usage statistics
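The response mirrors the request: a prefill pod can hand back the connection details that a later decode request echoes. Reusing the trimmed-struct approach from the request.go sketch above, a hypothetical prefill-side reply might be serialized like this (all values illustrative):

package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed copy of baseCompletionResponse, for illustration only.
type remoteResponse struct {
	Object          string   `json:"object"`
	DoRemoteDecode  bool     `json:"do_remote_decode"`
	RemoteBlockIds  []string `json:"remote_block_ids"`
	RemoteEngineId  string   `json:"remote_engine_id"`
	RemoteHost      string   `json:"remote_host"`
	RemotePort      int      `json:"remote_port"`
}

func main() {
	// Hypothetical prefill reply: it tells the caller where the KV-cache
	// blocks live so a follow-up decode request can reference them.
	out, err := json.Marshal(remoteResponse{
		Object:         "text_completion",
		DoRemoteDecode: true,
		RemoteBlockIds: []string{"blk-1", "blk-2"},
		RemoteEngineId: "engine-0",
		RemoteHost:     "10.0.0.7",
		RemotePort:     8000,
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}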

pkg/llm-d-inference-sim/simulator.go

Lines changed: 4 additions & 4 deletions
@@ -303,11 +303,10 @@ func (s *VllmSimulator) readRequest(ctx *fasthttp.RequestCtx, isChatCompletion b
 		return &req, nil
 	}
-	var req textCompletionRequest
 
+	var req textCompletionRequest
 	err := json.Unmarshal(ctx.Request.Body(), &req)
 
-	fmt.Printf("Unmarshaled text request: %#v\n", req)
 	return &req, err
 }

@@ -490,8 +489,9 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
 				ctx:              reqCtx.httpReqCtx,
 				isChatCompletion: reqCtx.isChatCompletion,
 				model:            displayModel,
+				doRemotePrefill:  req.doRemotePrefill(),
 			},
-			responseTokens, toolCalls, finishReason, usageDataToSend, req.doRemotePrefill(),
+			responseTokens, toolCalls, finishReason, usageDataToSend,
 		)
 	} else {
 		if req.doRemoteDecode() {

@@ -671,7 +671,7 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 	s.responseSentCallback(modelName)
 }
 
-// returns time to first token based on whether
+// returns time to first token based on the current request's doRemotePrefill
 func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool) int {
 	if doRemotePrefill {
 		return s.config.KVCacheTransferLatency
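Pieced together from the hunk above and the defaults exercised in config_test.go, the selection logic behind getTimeToFirstToken is roughly the sketch below; the diff cuts off inside the if, so the fall-through branch returning s.config.TimeToFirstToken is an assumption, not shown in the commit:

// Sketch assembled from the visible diff lines; the final return is
// assumed, since the hunk ends inside the if block.
func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool) int {
	if doRemotePrefill {
		// prefill ran on another pod: the first token only waits for the
		// KV-cache transfer (100 ms in the test defaults, 50 ms when
		// overridden on the command line)
		return s.config.KVCacheTransferLatency
	}
	// local prefill: full time to first token (2000 ms in the test defaults)
	return s.config.TimeToFirstToken
}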

pkg/llm-d-inference-sim/streaming.go

Lines changed: 6 additions & 5 deletions
@@ -31,14 +31,15 @@ type streamingContext struct {
 	isChatCompletion bool
 	model            string
 	creationTime     int64
+	doRemotePrefill  bool
 }
 
 // sendStreamingResponse creates and sends a streaming response for completion requests of both types (text and chat)
 // as defined by isChatCompletion
 // response content is wrapped according SSE format
 // First token is send after timeToFirstToken milliseconds, every other token is sent after interTokenLatency milliseconds
 func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, responseTokens []string, toolCalls []toolCall,
-	finishReason string, usageData *usage, doRemotePrefill bool) {
+	finishReason string, usageData *usage) {
 	context.ctx.SetContentType("text/event-stream")
 	context.ctx.SetStatusCode(fasthttp.StatusOK)

@@ -57,11 +58,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 	if len(toolCalls) > 0 {
 		s.logger.Info("Going to send tools calls")
 		for _, tc := range toolCalls {
-			s.sendTokenChunks(context, w, tc.Function.tokenizedArguments, &tc, finishReason, doRemotePrefill)
+			s.sendTokenChunks(context, w, tc.Function.tokenizedArguments, &tc, finishReason)
 		}
 	} else {
 		s.logger.Info("Going to send text", "number of tokens", usageData.CompletionTokens)
-		s.sendTokenChunks(context, w, responseTokens, nil, finishReason, doRemotePrefill)
+		s.sendTokenChunks(context, w, responseTokens, nil, finishReason)
 	}
 }

@@ -84,9 +85,9 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 }
 
 // sendTokenChunks creates and sends response chunks
-func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, tokens []string, tc *toolCall, finishReason string, doRemotePrefill bool) {
+func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, tokens []string, tc *toolCall, finishReason string) {
 	// time to first token delay
-	time.Sleep(time.Duration(s.getTimeToFirstToken(doRemotePrefill)) * time.Millisecond)
+	time.Sleep(time.Duration(s.getTimeToFirstToken(context.doRemotePrefill)) * time.Millisecond)
 
 	for i, token := range tokens {
 		if i != 0 {
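Net effect of this file plus the reqProcessingWorker hunk in simulator.go: the doRemotePrefill flag now rides on streamingContext instead of being threaded through every streaming call. Reassembled from the diff, the call site reads roughly as follows (the function name at the call site is inferred from the signatures above; surrounding worker code is elided):

// Reassembled from the reqProcessingWorker hunk, for illustration only.
s.sendStreamingResponse(
	&streamingContext{
		ctx:              reqCtx.httpReqCtx,
		isChatCompletion: reqCtx.isChatCompletion,
		model:            displayModel,
		doRemotePrefill:  req.doRemotePrefill(),
	},
	responseTokens, toolCalls, finishReason, usageDataToSend,
)

Carrying the per-request flag on the context struct keeps the sendStreamingResponse and sendTokenChunks signatures stable as more per-request state (like creationTime) accumulates.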
