Commit 2ccfb56

add request max_tokens/max_completion_tokens
1 parent 9051109 commit 2ccfb56

9 files changed (+557, -43 lines)


go.mod

Lines changed: 5 additions & 0 deletions
@@ -10,6 +10,7 @@ require (
 	github.com/google/uuid v1.6.0
 	github.com/onsi/ginkgo/v2 v2.23.4
 	github.com/onsi/gomega v1.37.0
+	github.com/openai/openai-go v0.1.0-beta.10
 	github.com/prometheus/client_golang v1.21.1
 	github.com/spf13/pflag v1.0.6
 	github.com/valyala/fasthttp v1.59.0
@@ -30,6 +31,10 @@ require (
 	github.com/prometheus/client_model v0.6.1 // indirect
 	github.com/prometheus/common v0.62.0 // indirect
 	github.com/prometheus/procfs v0.15.1 // indirect
+	github.com/tidwall/gjson v1.18.0 // indirect
+	github.com/tidwall/match v1.1.1 // indirect
+	github.com/tidwall/pretty v1.2.1 // indirect
+	github.com/tidwall/sjson v1.2.5 // indirect
 	github.com/valyala/bytebufferpool v1.0.0 // indirect
 	go.uber.org/automaxprocs v1.6.0 // indirect
 	golang.org/x/net v0.38.0 // indirect

go.sum

Lines changed: 12 additions & 0 deletions
@@ -32,6 +32,8 @@ github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus
 github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8=
 github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y=
 github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0=
+github.com/openai/openai-go v0.1.0-beta.10 h1:CknhGXe8aXQMRuqg255PFnWzgRY9nEryMxoNIBBM9tU=
+github.com/openai/openai-go v0.1.0-beta.10/go.mod h1:g461MYGXEXBVdV5SaR/5tNzNbSfwTBBefwc+LlDCK0Y=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g=
@@ -50,6 +52,16 @@ github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o=
 github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
 github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
 github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
+github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
+github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
+github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
+github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
+github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
 github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
 github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
 github.com/valyala/fasthttp v1.59.0 h1:Qu0qYHfXvPk1mSLNqcFtEk6DpxgA26hy6bmydotDpRI=

pkg/vllm-sim/defs.go

Lines changed: 56 additions & 12 deletions
@@ -19,6 +19,7 @@ limitations under the License.
 package vllmsim
 
 import (
+	"fmt"
 	"sync"
 
 	"github.com/go-logr/logr"
@@ -45,7 +46,7 @@ type VllmSimulator struct {
 	interTokenLatency int
 	// port defines on which port the simulator runs
 	port int
-	// mode defenes the simulator response generation mode, valid values: echo, random
+	// mode defines the simulator response generation mode, valid values: echo, random
 	mode string
 	// model defines the current base model name
 	model string
@@ -118,7 +119,7 @@ func (b *baseCompletionRequest) getModel() string {
 // completionRequest interface representing both completion request types (text and chat)
 type completionRequest interface {
 	// createResponseText creates and returns response payload based on this request
-	createResponseText(mode string) string
+	createResponseText(mode string) (string, error)
 	// isStream returns boolean that defines is response should be streamed
 	isStream() bool
 	// getModel returns model name as defined in the request
@@ -146,6 +147,18 @@ type chatCompletionRequest struct {
 	baseCompletionRequest
 	// Messages list of request's Messages
 	Messages []message `json:"messages"`
+
+	// The maximum number of tokens that can be generated in the chat
+	// completion. This value can be used to control costs for text
+	// generated via API.
+	// This value is now deprecated in favor of max_completion_tokens
+	// and is not compatible with o1 series models.
+	MaxTokens *int64 `json:"max_tokens"`
+
+	// An upper bound for the number of tokens that can be
+	// generated for a completion, including visible output
+	// tokens and reasoning tokens.
+	MaxCompletionTokens *int64 `json:"max_completion_tokens"`
 }
 
 // chatCompletionResponse defines structure of /chat/completion response
@@ -168,9 +181,13 @@ type textCompletionRequest struct {
 	baseCompletionRequest
 	// Prompt defines request's content
 	Prompt string `json:"prompt"`
-	// TODO - do we want to support max tokens?
-	// MaxTokens is a maximum number of tokens in response
-	MaxTokens int `json:"max_tokens"`
+
+	// The maximum number of [tokens](/tokenizer) that can be generated in the
+	// completion.
+	//
+	// The token count of your prompt plus `max_tokens` cannot exceed the model's
+	// context length.
+	MaxTokens *int64 `json:"max_tokens"`
 }
 
 // textCompletionResponse defines structure of /completion response
@@ -204,21 +221,48 @@ type chatRespChunkChoice struct {
 	Delta message `json:"delta"`
 }
 
+// returns the max. tokens or error if incorrect
+func getMaxTokens(maxCompletionTokens *int64, maxTokens *int64) (*int64, error) {
+	var typeToken string
+	var tokens *int64
+	// if both arguments are passed,
+	// use maxCompletionTokens
+	// as in the real vllm
+	if maxCompletionTokens != nil {
+		tokens = maxCompletionTokens
+		typeToken = "max_completion_tokens"
+	} else if maxTokens != nil {
+		tokens = maxTokens
+		typeToken = "max_tokens"
+	}
+	if tokens != nil && *tokens < 1 {
+		return nil, fmt.Errorf("%s must be at least 1, got %d", typeToken, *tokens)
+	}
+	return tokens, nil
+}
+
 // createResponseText creates response text for the given chat completion request and mode
-func (req chatCompletionRequest) createResponseText(mode string) string {
+func (req chatCompletionRequest) createResponseText(mode string) (string, error) {
+	maxTokens, err := getMaxTokens(req.MaxCompletionTokens, req.MaxTokens)
+	if err != nil {
+		return "", err
+	}
 	if mode == modeEcho {
-		return req.getLastUserMsg()
+		return getResponseText(maxTokens, req.getLastUserMsg()), nil
 	}
-	return getRandomResponseText()
+	return getRandomResponseText(maxTokens), nil
 }
 
 // createResponseText creates response text for the given text completion request and mode
-func (req textCompletionRequest) createResponseText(mode string) string {
+func (req textCompletionRequest) createResponseText(mode string) (string, error) {
+	maxTokens, err := getMaxTokens(nil, req.MaxTokens)
+	if err != nil {
+		return "", err
+	}
 	if mode == modeEcho {
-		return req.Prompt
-	} else {
-		return getRandomResponseText()
+		return getResponseText(maxTokens, req.Prompt), nil
 	}
+	return getRandomResponseText(maxTokens), nil
 }
 
 // getLastUserMsg returns last message from this request's messages with user role,
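
Example (not part of this commit): a hypothetical unit test exercising the getMaxTokens helper added above, assuming it sits next to defs.go in package vllmsim. When both fields are set, max_completion_tokens takes precedence (as in real vLLM), and any value below 1 is rejected.

package vllmsim

import "testing"

// TestGetMaxTokensPrecedence is an illustrative sketch, not shipped with this change.
func TestGetMaxTokensPrecedence(t *testing.T) {
	five, zero := int64(5), int64(0)

	// max_completion_tokens wins when both fields are present.
	tokens, err := getMaxTokens(&five, &zero)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if tokens == nil || *tokens != 5 {
		t.Fatal("expected a limit of 5")
	}

	// Values below 1 are rejected with a descriptive error.
	if _, err := getMaxTokens(nil, &zero); err == nil {
		t.Fatal("expected an error for max_tokens = 0")
	}

	// When neither field is set, no limit is applied and no error is returned.
	if tokens, err := getMaxTokens(nil, nil); err != nil || tokens != nil {
		t.Fatalf("expected no limit, got %v (err: %v)", tokens, err)
	}
}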

pkg/vllm-sim/metrics.go

Lines changed: 10 additions & 6 deletions
@@ -132,14 +132,18 @@ func (s *VllmSimulator) reportLoras() {
 
 // reportRunningRequests sets information about running completion requests
 func (s *VllmSimulator) reportRunningRequests() {
-	nRunningReqs := atomic.LoadInt64(&(s.nRunningReqs))
-	s.runningRequests.WithLabelValues(
-		s.model).Set(float64(nRunningReqs))
+	if s.runningRequests != nil {
+		nRunningReqs := atomic.LoadInt64(&(s.nRunningReqs))
+		s.runningRequests.WithLabelValues(
+			s.model).Set(float64(nRunningReqs))
+	}
 }
 
 // reportWaitingRequests sets information about waiting completion requests
 func (s *VllmSimulator) reportWaitingRequests() {
-	nWaitingReqs := atomic.LoadInt64(&(s.nWaitingReqs))
-	s.waitingRequests.WithLabelValues(
-		s.model).Set(float64(nWaitingReqs))
+	if s.waitingRequests != nil {
+		nWaitingReqs := atomic.LoadInt64(&(s.nWaitingReqs))
+		s.waitingRequests.WithLabelValues(
+			s.model).Set(float64(nWaitingReqs))
+	}
 }
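
Example (not part of this commit): a hypothetical regression test for the nil guards above; calling the reporters on a simulator whose Prometheus gauges were never registered should now be a no-op instead of a nil-pointer panic. It assumes the test lives in package vllmsim next to metrics.go.

package vllmsim

import "testing"

// TestReportersWithoutGauges is an illustrative sketch, not shipped with this change.
func TestReportersWithoutGauges(t *testing.T) {
	s := &VllmSimulator{}
	// Both gauges are nil here; with the guards these calls simply do nothing.
	s.reportRunningRequests()
	s.reportWaitingRequests()
}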

pkg/vllm-sim/simulator.go

Lines changed: 47 additions & 22 deletions
@@ -22,6 +22,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"net"
+	"os"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -37,6 +38,8 @@ import (
 	"github.com/valyala/fasthttp/fasthttpadaptor"
 )
 
+const vLLMDefaultPort = 8000
+
 // New creates a new VllmSimulator instance with the given logger
 func New(logger logr.Logger) *VllmSimulator {
 	return &VllmSimulator{
@@ -63,24 +66,32 @@ func (s *VllmSimulator) Start(ctx context.Context) error {
 	for i := 1; i <= int(s.maxRunningReqs); i++ {
 		go s.reqProcessingWorker(ctx, i)
 	}
+	listener, err := s.newListener()
+	if err != nil {
+		return err
+	}
+
 	// start the http server
-	return s.startServer()
+	return s.startServer(listener)
 }
 
 // parseCommandParams parses and validates command line parameters
 func (s *VllmSimulator) parseCommandParams() error {
-	pflag.StringVar(&s.mode, "mode", "random", "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
-	pflag.IntVar(&s.port, "port", 0, "Port")
-	pflag.IntVar(&s.interTokenLatency, "inter-token-latency", 0, "Time to generate one token (in milliseconds)")
-	pflag.IntVar(&s.timeToFirstToken, "time-to-first-token", 0, "Time to first token (in milliseconds)")
-	pflag.StringVar(&s.model, "model", "", "Currently 'loaded' model")
+	f := pflag.NewFlagSet("vllm-sim flags", pflag.ExitOnError)
+	f.StringVar(&s.mode, "mode", "random", "Simulator mode, echo - returns the same text that was sent in the request, for chat completion returns the last message, random - returns random sentence from a bank of pre-defined sentences")
+	f.IntVar(&s.port, "port", vLLMDefaultPort, "Port")
+	f.IntVar(&s.interTokenLatency, "inter-token-latency", 0, "Time to generate one token (in milliseconds)")
+	f.IntVar(&s.timeToFirstToken, "time-to-first-token", 0, "Time to first token (in milliseconds)")
+	f.StringVar(&s.model, "model", "", "Currently 'loaded' model")
 	var lorasStr string
-	pflag.StringVar(&lorasStr, "lora", "", "List of LoRA adapters, separated by comma")
-	pflag.IntVar(&s.maxLoras, "max-loras", 1, "Maximum number of LoRAs in a single batch")
-	pflag.IntVar(&s.maxCpuLoras, "max-cpu-loras", 0, "Maximum number of LoRAs to store in CPU memory")
-	pflag.Int64Var(&s.maxRunningReqs, "max-running-requests", 5, "Maximum number of inference requests that could be processed at the same time (parameter to simulate requests waiting queue)")
+	f.StringVar(&lorasStr, "lora", "", "List of LoRA adapters, separated by comma")
+	f.IntVar(&s.maxLoras, "max-loras", 1, "Maximum number of LoRAs in a single batch")
+	f.IntVar(&s.maxCpuLoras, "max-cpu-loras", 0, "Maximum number of LoRAs to store in CPU memory")
+	f.Int64Var(&s.maxRunningReqs, "max-running-requests", 5, "Maximum number of inference requests that could be processed at the same time (parameter to simulate requests waiting queue)")
 
-	pflag.Parse()
+	if err := f.Parse(os.Args[1:]); err != nil {
+		return err
+	}
 
 	loras := strings.Split(lorasStr, ",")
 	for _, lora := range loras {
@@ -120,8 +131,17 @@ func (s *VllmSimulator) parseCommandParams() error {
 	return nil
 }
 
+func (s *VllmSimulator) newListener() (net.Listener, error) {
+	s.logger.Info("Server starting", "port", s.port)
+	listener, err := net.Listen("tcp4", fmt.Sprintf(":%d", s.port))
+	if err != nil {
+		return nil, err
+	}
+	return listener, nil
+}
+
 // startServer starts http server on port defined in command line
-func (s *VllmSimulator) startServer() error {
+func (s *VllmSimulator) startServer(listener net.Listener) error {
 	r := fasthttprouter.New()
 
 	// support completion APIs
@@ -141,11 +161,6 @@ func (s *VllmSimulator) startServer() error {
 		Logger: s,
 	}
 
-	s.logger.Info("Server starting", "port", s.port)
-	listener, err := net.Listen("tcp4", fmt.Sprintf(":%d", s.port))
-	if err != nil {
-		return err
-	}
 	defer func() {
 		if err := listener.Close(); err != nil {
 			s.logger.Error(err, "server listener close failed")
@@ -308,12 +323,22 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
 			atomic.AddInt64(&(s.nRunningReqs), 1)
 			s.reportRunningRequests()
 
-			responseTxt := req.createResponseText(s.mode)
-
-			if req.isStream() {
-				s.sendStreamingResponse(reqCtx.isChatCompletion, reqCtx.httpReqCtx, responseTxt, model)
+			responseTxt, err := req.createResponseText(s.mode)
+			if err != nil {
+				prefix := ""
+				if reqCtx.isChatCompletion {
+					prefix = "failed to create chat response"
+				} else {
+					prefix = "failed to create text response"
+				}
+				s.logger.Error(err, prefix)
+				reqCtx.httpReqCtx.Error(prefix+err.Error(), fasthttp.StatusBadRequest)
 			} else {
-				s.sendResponse(reqCtx.isChatCompletion, reqCtx.httpReqCtx, responseTxt, model)
+				if req.isStream() {
+					s.sendStreamingResponse(reqCtx.isChatCompletion, reqCtx.httpReqCtx, responseTxt, model)
+				} else {
+					s.sendResponse(reqCtx.isChatCompletion, reqCtx.httpReqCtx, responseTxt, model)
+				}
 			}
 			reqCtx.wg.Done()
 		}
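
Example (not part of this commit): a hypothetical end-to-end check of the new validation path. A chat completion whose max_tokens is below 1 should now be rejected with 400 Bad Request. It assumes the simulator is running locally on the vLLM default port 8000 and serves the OpenAI-compatible /v1/chat/completions path; the model name is a placeholder.

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// max_tokens of 0 fails getMaxTokens and should surface as 400 Bad Request.
	body := []byte(`{"model": "my-model", "messages": [{"role": "user", "content": "hi"}], "max_tokens": 0}`)
	resp, err := http.Post("http://localhost:8000/v1/chat/completions", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status) // expected: 400 Bad Request
}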
