Commit 10d5088

Add P/D support, respond accordingly to doRemotePrefill and doRemoteDecode fields
Signed-off-by: Maya Barnea <mayab@il.ibm.com>
1 parent 9f3d093 commit 10d5088

File tree

5 files changed: +89, -18 lines changed


pkg/llm-d-inference-sim/config.go

Lines changed: 6 additions & 0 deletions
@@ -53,6 +53,9 @@ type configuration struct {
     TimeToFirstToken int `yaml:"time-to-first-token"`
     // InterTokenLatency time between generated tokens, in milliseconds
     InterTokenLatency int `yaml:"inter-token-latency"`
+    // KVCacheTransferLatency time to "transfer" kv-cache from another vLLM instance in case P/D is activated, in milliseconds
+    KVCacheTransferLatency int `yaml:"kv_cache_transfer_latency"`
+
     // Mode defines the simulator response generation mode, valid values: echo, random
     Mode string `yaml:"mode"`
     // Seed defines random seed for operations
@@ -145,6 +148,9 @@ func (c *configuration) validate() error {
     if c.TimeToFirstToken < 0 {
         return errors.New("time to first token cannot be negative")
     }
+    if c.KVCacheTransferLatency < 0 {
+        return errors.New("kv-cache transfer time cannot be negative")
+    }
     if c.MaxLoras < 1 {
         return errors.New("max LoRAs cannot be less than 1")
     }
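
For illustration only (not part of the commit): a minimal sketch of how the new knob surfaces in a YAML config and in validation. It assumes the config is parsed with a standard YAML library such as gopkg.in/yaml.v3, that the snippet lives in the same package as the unexported configuration type, and the package name is invented for the example.

package llmdinferencesim // package name assumed for the example

import (
    "fmt"

    "gopkg.in/yaml.v3"
)

// exampleKVCacheTransferLatency reads the new kv_cache_transfer_latency key and
// shows that validate() now rejects negative values.
func exampleKVCacheTransferLatency() {
    raw := []byte("time-to-first-token: 2000\ninter-token-latency: 100\nkv_cache_transfer_latency: 100\n")

    var c configuration
    if err := yaml.Unmarshal(raw, &c); err != nil {
        fmt.Println("unmarshal error:", err)
        return
    }
    fmt.Println(c.KVCacheTransferLatency) // 100

    c.KVCacheTransferLatency = -1
    fmt.Println(c.validate() != nil) // true: among other checks, negative kv-cache transfer time is rejected
}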

pkg/llm-d-inference-sim/request.go

Lines changed: 19 additions & 1 deletion
@@ -44,6 +44,10 @@ type completionRequest interface {
     getToolChoice() string
     // getMaxCompletionTokens returns the maximum completion tokens requested
     getMaxCompletionTokens() *int64
+    // doRemoteDecode() returns true if do_remote_decode is true in the request, meaning this is a prefill request
+    doRemoteDecode() bool
+    // doRemotePrefill() returns true if do_remote_prefill is true in the request, meaning this is a decode request
+    doRemotePrefill() bool
 }

 // baseCompletionRequest contains base completion request related information
@@ -53,7 +57,13 @@ type baseCompletionRequest struct {
     // StreamOptions defines streaming options in case Stream is set to true
     StreamOptions streamOptions `json:"stream_options"`
     // Model defines Model name to use for "inference", could be base Model name or one of available LoRA adapters
-    Model string `json:"model"`
+    Model           string   `json:"model"`
+    DoRemoteDecode  bool     `json:"do_remote_decode"`
+    DoRemotePrefill bool     `json:"do_remote_prefill"`
+    RemoteBlockIds  []string `json:"remote_block_ids"`
+    RemoteEngineId  string   `json:"remote_engine_id"`
+    RemoteHost      string   `json:"remote_host"`
+    RemotePort      int      `json:"remote_port"`
 }

 // StreamOptions defines streaming options for streaming requests
@@ -74,6 +84,14 @@ func (b *baseCompletionRequest) includeUsage() bool {
     return !b.Stream || b.StreamOptions.IncludeUsage
 }

+func (b *baseCompletionRequest) doRemoteDecode() bool {
+    return b.DoRemoteDecode
+}
+
+func (b *baseCompletionRequest) doRemotePrefill() bool {
+    return b.DoRemotePrefill
+}
+
 // completionReqCtx is a context passed in the simulator's flow, it contains the request data needed
 // to generate the simulator's response
 type completionReqCtx struct {
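
For illustration only (not part of the commit): a sketch of how the new request fields are populated from a JSON body and what the two accessors report. The model name is a placeholder, and the snippet assumes it sits in the same (assumed) package as the unexported types.

package llmdinferencesim // package name assumed for the example

import (
    "encoding/json"
    "fmt"
)

// exampleRemotePrefillDecodeFlags unmarshals a request that marks itself as a prefill request
// (do_remote_decode=true) and checks the accessors added above.
func exampleRemotePrefillDecodeFlags() {
    body := []byte(`{"model": "my-model", "do_remote_decode": true, "do_remote_prefill": false}`)

    var req baseCompletionRequest
    if err := json.Unmarshal(body, &req); err != nil {
        fmt.Println("unmarshal error:", err)
        return
    }
    fmt.Println(req.doRemoteDecode())  // true  -> handled as a prefill request
    fmt.Println(req.doRemotePrefill()) // false -> no kv-cache transfer to simulate
}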

pkg/llm-d-inference-sim/response.go

Lines changed: 7 additions & 1 deletion
@@ -37,7 +37,13 @@ type baseCompletionResponse struct {
     // Usage contains the token usage statistics for the request
     Usage *usage `json:"usage"`
     // Object is the Object type, "text_completion", "chat.completion", or "chat.completion.chunk"
-    Object string `json:"object"`
+    Object          string   `json:"object"`
+    DoRemoteDecode  bool     `json:"do_remote_decode"`
+    DoRemotePrefill bool     `json:"do_remote_prefill"`
+    RemoteBlockIds  []string `json:"remote_block_ids"`
+    RemoteEngineId  string   `json:"remote_engine_id"`
+    RemoteHost      string   `json:"remote_host"`
+    RemotePort      int      `json:"remote_port"`
 }

 // usage contains token usage statistics
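
For illustration only (not part of the commit): roughly what the extra fields look like on the wire for a prefill response, reusing the hard-coded DUMMY values that createCompletionResponse (in simulator.go below) assigns. The ID value is a placeholder and the package name is assumed.

package llmdinferencesim // package name assumed for the example

import (
    "encoding/json"
    "fmt"
)

// exampleRemoteDecodeResponseJSON marshals a response carrying the new remote-transfer fields.
func exampleRemoteDecodeResponseJSON() {
    resp := baseCompletionResponse{
        ID:              "cmpl-example",
        Object:          textCompletionObject,
        DoRemoteDecode:  true,
        DoRemotePrefill: false,
        RemoteBlockIds:  []string{"DUMMY_ID"},
        RemoteEngineId:  "DUMMY_ID",
        RemoteHost:      "DUMMY",
        RemotePort:      1234,
    }
    data, _ := json.Marshal(resp)
    // output includes "do_remote_decode":true plus the remote_block_ids/engine/host/port fields
    fmt.Println(string(data))
}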

pkg/llm-d-inference-sim/simulator.go

Lines changed: 52 additions & 11 deletions
@@ -49,6 +49,7 @@ const (
     stopFinishReason = "stop"
     lengthFinishReason = "length"
     toolsFinishReason = "tool_calls"
+    remoteDecodeFinishReason = "remote_decode"
     roleAssistant = "assistant"
     roleUser = "user"
     textCompletionObject = "text_completion"
@@ -155,6 +156,7 @@ func (s *VllmSimulator) parseCommandParamsAndLoadConfig() error {
     f.StringVar(&config.Mode, "mode", config.Mode, "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
     f.IntVar(&config.InterTokenLatency, "inter-token-latency", config.InterTokenLatency, "Time to generate one token (in milliseconds)")
     f.IntVar(&config.TimeToFirstToken, "time-to-first-token", config.TimeToFirstToken, "Time to first token (in milliseconds)")
+    f.IntVar(&config.KVCacheTransferLatency, "kv_cache_transfer_latency", config.KVCacheTransferLatency, "Time for KV-cache transfer from a remote vLLM (in milliseconds)")
     f.Int64Var(&config.Seed, "seed", config.Seed, "Random seed for operations (if not set, current Unix time in nanoseconds is used)")

     // These values were manually parsed above in getParamValueFromArgs, we leave this in order to get these flags in --help
@@ -304,6 +306,8 @@ func (s *VllmSimulator) readRequest(ctx *fasthttp.RequestCtx, isChatCompletion b
     var req textCompletionRequest

     err := json.Unmarshal(ctx.Request.Body(), &req)
+
+    fmt.Printf("Unmarshaled text request: %#v\n", req)
     return &req, err
 }


@@ -329,6 +333,18 @@ func (s *VllmSimulator) HandleUnloadLora(ctx *fasthttp.RequestCtx) {
     s.unloadLora(ctx)
 }

+func (s *VllmSimulator) validateRequest(req completionRequest) (string, string, int) {
+    if !s.isValidModel(req.getModel()) {
+        return fmt.Sprintf("The model `%s` does not exist.", req.getModel()), "NotFoundError", fasthttp.StatusNotFound
+    }
+
+    if req.doRemoteDecode() && req.isStream() {
+        return "Prefill does not support streaming", "Invalid request", fasthttp.StatusBadRequest
+    }
+
+    return "", "", fasthttp.StatusOK
+}
+
 // isValidModel checks if the given model is the base model or one of "loaded" LoRAs
 func (s *VllmSimulator) isValidModel(model string) bool {
     for _, name := range s.config.ServedModelNames {
@@ -365,11 +381,9 @@ func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatComple
         return
     }

-    model := vllmReq.getModel()
-
-    if !s.isValidModel(model) {
-        s.sendCompletionError(ctx, fmt.Sprintf("The model `%s` does not exist.", vllmReq.getModel()),
-            "NotFoundError", fasthttp.StatusNotFound)
+    errMsg, errType, errCode := s.validateRequest(vllmReq)
+    if errMsg != "" {
+        s.sendCompletionError(ctx, errMsg, errType, errCode)
         return
     }

@@ -477,16 +491,23 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
                 isChatCompletion: reqCtx.isChatCompletion,
                 model:            displayModel,
             },
-            responseTokens, toolCalls, finishReason, usageDataToSend,
+            responseTokens, toolCalls, finishReason, usageDataToSend, req.doRemotePrefill(),
         )
     } else {
+        if req.doRemoteDecode() {
+            // in case this is prefill pod processing, return special finish reason
+            finishReason = remoteDecodeFinishReason
+        }
+
         s.sendResponse(reqCtx.isChatCompletion,
             reqCtx.httpReqCtx,
             responseTokens,
             toolCalls,
             displayModel,
             finishReason,
-            &usageData)
+            &usageData,
+            req.doRemoteDecode(),
+            req.doRemotePrefill())
     }
 }
 reqCtx.wg.Done()
@@ -575,13 +596,25 @@ func (s *VllmSimulator) HandleError(_ *fasthttp.RequestCtx, err error) {
 // modelName - display name returned to the client and used in metrics. It is either the first alias
 // from --served-model-name (for a base-model request) or the LoRA adapter name (for a LoRA request).
 func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respTokens []string, toolCalls []toolCall,
-    finishReason *string, usageData *usage, modelName string) completionResponse {
+    finishReason *string, usageData *usage, modelName string, doRemoteDecode bool) completionResponse {
     baseResp := baseCompletionResponse{
         ID:      chatComplIDPrefix + uuid.NewString(),
         Created: time.Now().Unix(),
         Model:   modelName,
         Usage:   usageData,
     }
+
+    if doRemoteDecode {
+        // add special fields related to the prefill pod special behavior
+        baseResp.DoRemoteDecode = true
+        baseResp.DoRemotePrefill = false
+        // currently remote prefill information is hard-coded
+        baseResp.RemoteBlockIds = []string{"DUMMY_ID"}
+        baseResp.RemoteEngineId = "DUMMY_ID"
+        baseResp.RemoteHost = "DUMMY"
+        baseResp.RemotePort = 1234
+    }
+
     baseChoice := baseResponseChoice{Index: 0, FinishReason: finishReason}

     respText := strings.Join(respTokens, "")
@@ -616,8 +649,8 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke
 // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools
 // usageData - usage (tokens statistics) for this response
 func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.RequestCtx, respTokens []string, toolCalls []toolCall,
-    modelName string, finishReason string, usageData *usage) {
-    resp := s.createCompletionResponse(isChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName)
+    modelName string, finishReason string, usageData *usage, doRemoteDecode bool, doRemotePrefill bool) {
+    resp := s.createCompletionResponse(isChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName, doRemoteDecode)

     data, err := json.Marshal(resp)
     if err != nil {
@@ -627,7 +660,7 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques

     // calculate how long to wait before returning the response, time is based on number of tokens
     numOfTokens := usageData.CompletionTokens
-    totalMillisToWait := s.config.TimeToFirstToken + (numOfTokens-1)*s.config.InterTokenLatency
+    totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill) + (numOfTokens-1)*s.config.InterTokenLatency
     time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)

     // TODO - maybe add pod id to response header for testing
@@ -638,6 +671,14 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
     s.responseSentCallback(modelName)
 }

+// getTimeToFirstToken returns the time to first token, depending on whether this request performs remote prefill
+func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool) int {
+    if doRemotePrefill {
+        return s.config.KVCacheTransferLatency
+    }
+    return s.config.TimeToFirstToken
+}
+
 // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
 func (s *VllmSimulator) createModelsResponse() *vllmapi.ModelsResponse {
     modelsResp := vllmapi.ModelsResponse{Object: "list", Data: []vllmapi.ModelsResponseModelInfo{}}
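
For illustration only (not part of the commit): the response delay now depends on whether the request carries do_remote_prefill, since getTimeToFirstToken swaps TimeToFirstToken for KVCacheTransferLatency. A standalone sketch of the same arithmetic with made-up latencies:

package main

import "fmt"

func main() {
    const (
        timeToFirstToken       = 2000 // ms, --time-to-first-token
        interTokenLatency      = 100  // ms, --inter-token-latency
        kvCacheTransferLatency = 150  // ms, --kv_cache_transfer_latency
    )
    numOfTokens := 5

    // local prefill: wait for the first token, then one inter-token gap per remaining token
    local := timeToFirstToken + (numOfTokens-1)*interTokenLatency
    // remote prefill (do_remote_prefill=true): first-token time is replaced by the kv-cache transfer time
    remote := kvCacheTransferLatency + (numOfTokens-1)*interTokenLatency

    fmt.Println(local, remote) // 2400 550
}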

pkg/llm-d-inference-sim/streaming.go

Lines changed: 5 additions & 5 deletions
@@ -38,7 +38,7 @@ type streamingContext struct {
 // response content is wrapped according SSE format
 // First token is send after timeToFirstToken milliseconds, every other token is sent after interTokenLatency milliseconds
 func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, responseTokens []string, toolCalls []toolCall,
-    finishReason string, usageData *usage) {
+    finishReason string, usageData *usage, doRemotePrefill bool) {
     context.ctx.SetContentType("text/event-stream")
     context.ctx.SetStatusCode(fasthttp.StatusOK)

@@ -57,11 +57,11 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
     if len(toolCalls) > 0 {
         s.logger.Info("Going to send tools calls")
         for _, tc := range toolCalls {
-            s.sendTokenChunks(context, w, tc.Function.tokenizedArguments, &tc, finishReason)
+            s.sendTokenChunks(context, w, tc.Function.tokenizedArguments, &tc, finishReason, doRemotePrefill)
         }
     } else {
         s.logger.Info("Going to send text", "number of tokens", usageData.CompletionTokens)
-        s.sendTokenChunks(context, w, responseTokens, nil, finishReason)
+        s.sendTokenChunks(context, w, responseTokens, nil, finishReason, doRemotePrefill)
     }
 }

@@ -84,9 +84,9 @@ func (s *VllmSimulator) sendStreamingResponse(context *streamingContext, respons
 }

 // sendTokenChunks creates and sends response chunks
-func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, tokens []string, tc *toolCall, finishReason string) {
+func (s *VllmSimulator) sendTokenChunks(context *streamingContext, w *bufio.Writer, tokens []string, tc *toolCall, finishReason string, doRemotePrefill bool) {
     // time to first token delay
-    time.Sleep(time.Duration(s.config.TimeToFirstToken) * time.Millisecond)
+    time.Sleep(time.Duration(s.getTimeToFirstToken(doRemotePrefill)) * time.Millisecond)

     for i, token := range tokens {
         if i != 0 {
