
Commit 98b9585

committed
fixes according to the PR's comments
Signed-off-by: Maya Barnea <mayab@il.ibm.com>
1 parent 626c432 commit 98b9585

File tree

5 files changed: +44 -39 lines changed


pkg/llm-d-inference-sim/config_test.go

Lines changed: 8 additions & 14 deletions
@@ -45,14 +45,14 @@ func createSimConfig(args []string) (*configuration, error) {
 	return s.config, nil
 }
 
-func createDefaultBasicConfig(model string) *configuration {
+func createDefaultConfig(model string) *configuration {
 	c := newConfig()
 
 	c.Model = model
 	c.ServedModelNames = []string{c.Model}
 	c.MaxNumSeqs = 5
-	c.MaxLoras = 1
-	c.MaxCPULoras = 1
+	c.MaxLoras = 2
+	c.MaxCPULoras = 5
 	c.TimeToFirstToken = 2000
 	c.InterTokenLatency = 1000
 	c.KVCacheTransferLatency = 100

@@ -62,16 +62,6 @@ func createDefaultBasicConfig(model string) *configuration {
 	return c
 }
 
-func createDefaultConfig(model string) *configuration {
-	c := createDefaultBasicConfig(model)
-
-	// parameters special to config.yaml
-	c.MaxLoras = 2
-	c.MaxCPULoras = 5
-
-	return c
-}
-
 type testCase struct {
 	name string
 	args []string

@@ -189,8 +179,11 @@ var _ = Describe("Simulator configuration", func() {
 	tests = append(tests, test)
 
 	// Config from config.yaml file plus command line args with time to copy cache
-	c = createDefaultBasicConfig(qwenModelName)
+	c = createDefaultConfig(qwenModelName)
 	c.Port = 8001
+	// basic config file does not contain properties related to lora
+	c.MaxLoras = 1
+	c.MaxCPULoras = 1
 	c.KVCacheTransferLatency = 50
 	test = testCase{
 		name: "config file with command line args with time to transfer kv-cache",

@@ -263,5 +256,6 @@ var _ = Describe("Simulator configuration", func() {
 		Entry(tests[10].name, tests[10].args),
 		Entry(tests[11].name, tests[11].args),
 		Entry(tests[12].name, tests[12].args),
+		Entry(tests[13].name, tests[13].args),
 	)
 })
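Read together, the two hunks fold the old two-helper setup into one: createDefaultConfig now carries the config.yaml values directly, and the one test built on the basic config file overrides the LoRA fields back to the simulator defaults. Assembled from the hunks above (newConfig() and the configuration type live elsewhere in the package, and fields the diff elides are marked), the post-commit helper reads roughly:

func createDefaultConfig(model string) *configuration {
	c := newConfig()

	c.Model = model
	c.ServedModelNames = []string{c.Model}
	c.MaxNumSeqs = 5
	// values matching those set in config.yaml
	c.MaxLoras = 2
	c.MaxCPULoras = 5
	c.TimeToFirstToken = 2000
	c.InterTokenLatency = 1000
	c.KVCacheTransferLatency = 100
	// ... (fields between here and the return are not shown in the diff)

	return c
}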

pkg/llm-d-inference-sim/request.go

Lines changed: 15 additions & 9 deletions
@@ -44,9 +44,9 @@ type completionRequest interface {
 	getToolChoice() string
 	// getMaxCompletionTokens returns the maximum completion tokens requested
 	getMaxCompletionTokens() *int64
-	// isDoRemoteDecode() returns true is do_remote_decode is true in the request, this means that this is prefill request
+	// doRemoteDecode() returns true if the do_remote_decode field is true in the request, which means this is a prefill request
 	doRemoteDecode() bool
-	// isDoRemotePrefill() returns true is do_remote_prefill is true in the request, this means that this is decode request
+	// doRemotePrefill() returns true if the do_remote_prefill field is true in the request, which means this is a decode request
 	doRemotePrefill() bool
 }

@@ -57,13 +57,19 @@ type baseCompletionRequest struct {
 	// StreamOptions defines streaming options in case Stream is set to true
 	StreamOptions streamOptions `json:"stream_options"`
 	// Model defines Model name to use for "inference", could be base Model name or one of available LoRA adapters
-	Model           string   `json:"model"`
-	DoRemoteDecode  bool     `json:"do_remote_decode"`
-	DoRemotePrefill bool     `json:"do_remote_prefill"`
-	RemoteBlockIds  []string `json:"remote_block_ids"`
-	RemoteEngineId  string   `json:"remote_engine_id"`
-	RemoteHost      string   `json:"remote_host"`
-	RemotePort      int      `json:"remote_port"`
+	Model string `json:"model"`
+	// DoRemoteDecode is true when the request's decode will be done on a remote pod
+	DoRemoteDecode bool `json:"do_remote_decode"`
+	// DoRemotePrefill is true when the request's prefill was done on a remote pod
+	DoRemotePrefill bool `json:"do_remote_prefill"`
+	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
+	RemoteBlockIds []string `json:"remote_block_ids"`
+	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
+	RemoteEngineId string `json:"remote_engine_id"`
+	// RemoteHost is a hostname or IP address of the remote server handling prefill
+	RemoteHost string `json:"remote_host"`
+	// RemotePort is a port of the remote server handling prefill
+	RemotePort int `json:"remote_port"`
 }
 
 // StreamOptions defines streaming options for streaming requests
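These fields wire the simulator into a disaggregated prefill/decode flow. As a minimal sketch of what a decode-side request might look like on the wire, using a trimmed copy of the struct above (the model name and all field values are hypothetical):

package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed copy of baseCompletionRequest, for illustration only; the real
// struct lives in pkg/llm-d-inference-sim/request.go.
type remoteRequest struct {
	Model           string   `json:"model"`
	DoRemotePrefill bool     `json:"do_remote_prefill"`
	RemoteBlockIds  []string `json:"remote_block_ids"`
	RemoteEngineId  string   `json:"remote_engine_id"`
	RemoteHost      string   `json:"remote_host"`
	RemotePort      int      `json:"remote_port"`
}

func main() {
	// Hypothetical decode request: prefill already ran on another pod,
	// so the body points at the engine holding the KV-cache blocks.
	body := []byte(`{
		"model": "qwen",
		"do_remote_prefill": true,
		"remote_block_ids": ["blk-1", "blk-2"],
		"remote_engine_id": "engine-0",
		"remote_host": "10.0.0.7",
		"remote_port": 8000
	}`)

	var req remoteRequest
	if err := json.Unmarshal(body, &req); err != nil {
		panic(err)
	}
	fmt.Printf("decode request, prefill done remotely: %v\n", req.DoRemotePrefill)
}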

pkg/llm-d-inference-sim/response.go

Lines changed: 11 additions & 7 deletions
@@ -37,13 +37,17 @@ type baseCompletionResponse struct {
 	// Usage contains the token usage statistics for the request
 	Usage *usage `json:"usage"`
 	// Object is the Object type, "text_completion", "chat.completion", or "chat.completion.chunk"
-	Object          string   `json:"object"`
-	DoRemoteDecode  bool     `json:"do_remote_decode"`
-	DoRemotePrefill bool     `json:"do_remote_prefill"`
-	RemoteBlockIds  []string `json:"remote_block_ids"`
-	RemoteEngineId  string   `json:"remote_engine_id"`
-	RemoteHost      string   `json:"remote_host"`
-	RemotePort      int      `json:"remote_port"`
+	Object          string `json:"object"`
+	DoRemoteDecode  bool   `json:"do_remote_decode"`
+	DoRemotePrefill bool   `json:"do_remote_prefill"`
+	// RemoteBlockIds is a list of block identifiers to process remotely for distributed decoding
+	RemoteBlockIds []string `json:"remote_block_ids"`
+	// RemoteEngineId is an identifier of the remote inference engine or backend to use for processing requests
+	RemoteEngineId string `json:"remote_engine_id"`
+	// RemoteHost is a hostname or IP address of the remote server handling prefill
+	RemoteHost string `json:"remote_host"`
+	// RemotePort is a port of the remote server handling prefill
+	RemotePort int `json:"remote_port"`
 }
 
 // usage contains token usage statistics
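The response mirrors the request: a prefill pod can hand back the connection details that a later decode request echoes. Reusing the trimmed-struct approach from the request.go sketch above, a hypothetical prefill-side reply might be serialized like this (all values illustrative):

package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed copy of baseCompletionResponse, for illustration only.
type remoteResponse struct {
	Object          string   `json:"object"`
	DoRemoteDecode  bool     `json:"do_remote_decode"`
	RemoteBlockIds  []string `json:"remote_block_ids"`
	RemoteEngineId  string   `json:"remote_engine_id"`
	RemoteHost      string   `json:"remote_host"`
	RemotePort      int      `json:"remote_port"`
}

func main() {
	// Hypothetical prefill reply: it tells the caller where the KV-cache
	// blocks live so a follow-up decode request can reference them.
	out, err := json.Marshal(remoteResponse{
		Object:         "text_completion",
		DoRemoteDecode: true,
		RemoteBlockIds: []string{"blk-1", "blk-2"},
		RemoteEngineId: "engine-0",
		RemoteHost:     "10.0.0.7",
		RemotePort:     8000,
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out))
}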

pkg/llm-d-inference-sim/simulator.go

Lines changed: 4 additions & 4 deletions
@@ -303,11 +303,10 @@ func (s *VllmSimulator) readRequest(ctx *fasthttp.RequestCtx, isChatCompletion b
 		return &req, nil
 	}
-	var req textCompletionRequest
 
+	var req textCompletionRequest
 	err := json.Unmarshal(ctx.Request.Body(), &req)
 
-	fmt.Printf("Unmarshaled text request: %#v\n", req)
 	return &req, err
 }

@@ -490,8 +489,9 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
 				ctx:              reqCtx.httpReqCtx,
 				isChatCompletion: reqCtx.isChatCompletion,
 				model:            displayModel,
+				doRemotePrefill:  req.doRemotePrefill(),
 			},
-			responseTokens, toolCalls, finishReason, usageDataToSend, req.doRemotePrefill(),
+			responseTokens, toolCalls, finishReason, usageDataToSend,
 		)
 	} else {
 		if req.doRemoteDecode() {

@@ -671,7 +671,7 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
 	s.responseSentCallback(modelName)
 }
 
-// returns time to first token based on whether
+// returns time to first token based on the current request's doRemotePrefill
 func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool) int {
 	if doRemotePrefill {
 		return s.config.KVCacheTransferLatency
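Pieced together from the hunk above and the defaults exercised in config_test.go, the selection logic behind getTimeToFirstToken is roughly the sketch below; the diff cuts off inside the if, so the fall-through branch returning s.config.TimeToFirstToken is an assumption, not shown in the commit:

// Sketch assembled from the visible diff lines; the final return is
// assumed, since the hunk ends inside the if block.
func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool) int {
	if doRemotePrefill {
		// prefill ran on another pod: the first token only waits for the
		// KV-cache transfer (100 ms in the test defaults, 50 ms when
		// overridden on the command line)
		return s.config.KVCacheTransferLatency
	}
	// local prefill: full time to first token (2000 ms in the test defaults)
	return s.config.TimeToFirstToken
}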

pkg/llm-d-inference-sim/streaming.go

Lines changed: 6 additions & 5 deletions
@@ -31,14 +31,15 @@ type streamingContext struct {
 	isChatCompletion bool
 	model            string
 	creationTime     int64
+	doRemotePrefill  bool
 }
 
 // sendStreamingResponse creates and sends a streaming response for completion requests of both types (text and chat)
 // as defined by isChatCompletion
 // response content is wrapped according SSE format
 // First token is send after timeToFirstToken milliseconds, every other token is sent after interTokenLatency milliseconds
 func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, responseTokens []string, toolCalls []toolCall,
-	finishReason string, usageData *usage, doRemotePrefill bool) {
+	finishReason string, usageData *usage) {
 	context.ctx.SetContentType("text/event-stream")
 	context.ctx.SetStatusCode(fasthttp.StatusOK)

@@ -57,11 +58,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 	if len(toolCalls) > 0 {
 		s.logger.Info("Going to send tools calls")
 		for _, tc := range toolCalls {
-			s.sendTokenChunks(context, w, tc.Function.tokenizedArguments, &tc, finishReason, doRemotePrefill)
+			s.sendTokenChunks(context, w, tc.Function.tokenizedArguments, &tc, finishReason)
 		}
 	} else {
 		s.logger.Info("Going to send text", "number of tokens", usageData.CompletionTokens)
-		s.sendTokenChunks(context, w, responseTokens, nil, finishReason, doRemotePrefill)
+		s.sendTokenChunks(context, w, responseTokens, nil, finishReason)
 	}
 }

@@ -84,9 +85,9 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 }
 
 // sendTokenChunks creates and sends response chunks
-func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, tokens []string, tc *toolCall, finishReason string, doRemotePrefill bool) {
+func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, tokens []string, tc *toolCall, finishReason string) {
 	// time to first token delay
-	time.Sleep(time.Duration(s.getTimeToFirstToken(doRemotePrefill)) * time.Millisecond)
+	time.Sleep(time.Duration(s.getTimeToFirstToken(context.doRemotePrefill)) * time.Millisecond)
 
 	for i, token := range tokens {
 		if i != 0 {
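Net effect of this file plus the reqProcessingWorker hunk in simulator.go: the doRemotePrefill flag now rides on streamingContext instead of being threaded through every streaming call. Reassembled from the diff, the call site reads roughly as follows (the function name at the call site is inferred from the signatures above; surrounding worker code is elided):

// Reassembled from the reqProcessingWorker hunk, for illustration only.
s.sendStreamingResponse(
	&streamingContext{
		ctx:              reqCtx.httpReqCtx,
		isChatCompletion: reqCtx.isChatCompletion,
		model:            displayModel,
		doRemotePrefill:  req.doRemotePrefill(),
	},
	responseTokens, toolCalls, finishReason, usageDataToSend,
)

Carrying the per-request flag on the context struct keeps the sendStreamingResponse and sendTokenChunks signatures stable as more per-request state (like creationTime) accumulates.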
