You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboard. Expand all lines: pkg/llm-d-inference-sim/simulator.go
+52 -11. Lines changed: 52 additions & 11 deletions
Original file line number
Diff line number
Diff line change
@@ -49,6 +49,7 @@ const (
49
49
stopFinishReason="stop"
50
50
lengthFinishReason="length"
51
51
toolsFinishReason="tool_calls"
52
+
remoteDecodeFinishReason="remote_decode"
52
53
roleAssistant="assistant"
53
54
roleUser="user"
54
55
textCompletionObject="text_completion"
@@ -155,6 +156,7 @@ func (s *VllmSimulator) parseCommandParamsAndLoadConfig() error {
155
156
f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
156
157
f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
157
158
f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
159
+
f.IntVar(&config.KVCacheTransferLatency, "kv_cache_transfer_latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
158
160
f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")
159
161
160
162
// These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help
@@ -304,6 +306,8 @@ func (s *VllmSimulator) readRequest(ctx *fasthttp.RequestCtx, isChatCompletion b
304
306
varreqtextCompletionRequest
305
307
306
308
err:=json.Unmarshal(ctx.Request.Body(), &req)
309
+
310
+
fmt.Printf("Unmarshaled text request: %#v\n", req)
307
311
return&req, err
308
312
}
309
313
@@ -329,6 +333,18 @@ func (s *VllmSimulator) HandleUnloadLora(ctx *fasthttp.RequestCtx) {
func (s*VllmSimulator) getTimeToFirstToken(doRemotePrefillbool) int {
676
+
ifdoRemotePrefill {
677
+
returns.config.KVCacheTransferLatency
678
+
}
679
+
returns.config.TimeToFirstToken
680
+
}
681
+
641
682
// createModelsResponse creates and returns a ModelResponse for the current state; the returned array of models contains the base model plus any LoRA adapters, if they exist
0 commit comments