Commit 47adbd2

Limitation of number of concurrent running requests
1 parent b847393 commit 47adbd2

File tree

5 files changed: +81 -29 lines changed

- README.md
- cmd/vllm-sim/main.go
- pkg/vllm-sim/defs.go
- pkg/vllm-sim/metrics.go
- pkg/vllm-sim/simulator.go

README.md

Lines changed: 1 addition & 0 deletions
@@ -81,6 +81,7 @@ For more details see the <a href="https://docs.vllm.ai/en/stable/getting_started
 - `inter-token-latency`: the time to 'generate' each additional token (in milliseconds), optional, by default zero
 - `max-loras`: maximum number of LoRAs in a single batch, optional, default is one
 - `max-cpu-loras`: maximum number of LoRAs to store in CPU memory, optional, must be >= max_loras, default is max_loras
+- `max-running-requests`: maximum number of inference requests that could be processed at the same time

 ## Working with docker image
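With this line in place, the concurrency cap is documented alongside the simulator's other flags. For illustration only (the binary name and flag values here are assumptions, not taken from the README), an invocation such as `vllm-sim --max-running-requests 10` would allow at most ten requests to be processed at once, with further requests held until a worker frees up; the default set in the code below is 5.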

cmd/vllm-sim/main.go

Lines changed: 2 additions & 2 deletions
@@ -31,12 +31,12 @@ func main() {
 	// setup logger and context with graceful shutdown
 	logger := klog.Background()
 	ctx := klog.NewContext(context.Background(), logger)
-	_ = signals.SetupSignalHandler(ctx)
+	ctx = signals.SetupSignalHandler(ctx)

 	logger.Info("Start vllm simulator")

 	vllmSim := vllmsim.New(logger)
-	err := vllmSim.Start()
+	err := vllmSim.Start(ctx)

 	if err != nil {
 		logger.Error(err, "VLLM simulator failed")
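Previously the context returned by signals.SetupSignalHandler was discarded; now it replaces ctx, so a SIGINT/SIGTERM cancels the context that Start and, through it, the request-processing workers receive. The helper itself is not part of this diff; the following is only a rough sketch of what such a function typically does, assuming it is built on the standard library's signal.NotifyContext (the repository's actual implementation may differ):

```go
// Sketch of a typical signal-handler helper; not the repository's actual code.
package signals

import (
	"context"
	"os"
	"os/signal"
	"syscall"
)

// SetupSignalHandler returns a copy of the parent context that is cancelled
// when the process receives SIGINT or SIGTERM.
func SetupSignalHandler(parent context.Context) context.Context {
	ctx, stop := signal.NotifyContext(parent, os.Interrupt, syscall.SIGTERM)
	// Once the context is done, release the signal registration so that a
	// second signal falls back to the default (terminating) behavior.
	go func() {
		<-ctx.Done()
		stop()
	}()
	return ctx
}
```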

pkg/vllm-sim/defs.go

Lines changed: 13 additions & 0 deletions
@@ -23,6 +23,7 @@ import (

 	"github.com/go-logr/logr"
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/valyala/fasthttp"
 )

 const (

@@ -58,6 +59,9 @@ type VllmSimulator struct {
 	runningLoras sync.Map
 	// waitingLoras will represent collection of loras defined in requests in the queue - Not implemented yet
 	waitingLoras sync.Map
+	// maxRunningReqs defines the maximum number of inference requests that could be processed at the same time
+	maxRunningReqs int64
+	// nRunningReqs is the number of inference requests that are currently being processed
 	nRunningReqs int64
 	// loraInfo is prometheus gauge
 	loraInfo *prometheus.GaugeVec

@@ -67,6 +71,8 @@ type VllmSimulator struct {
 	waitingRequests *prometheus.GaugeVec
 	// kvCacheUsagePercentage is prometheus gauge
 	kvCacheUsagePercentage *prometheus.GaugeVec
+	// reqChan is the channel for requests to be passed to workers
+	reqChan chan *completionReqCtx
 }

 // baseResponseChoice contains base completion response's choice related information

@@ -117,6 +123,13 @@ type completionRequest interface {
 	getModel() string
 }

+type completionReqCtx struct {
+	completionReq    completionRequest
+	httpReqCtx       *fasthttp.RequestCtx
+	isChatCompletion bool
+	wg               *sync.WaitGroup
+}
+
 // v1/chat/completion
 // message defines vLLM chat completion message
 type message struct {
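The new completionReqCtx type is the unit of work handed from the HTTP handler to the worker pool: it carries the parsed request, the fasthttp request context the response must be written to, and a WaitGroup the handler blocks on until a worker finishes. Stripped of the simulator's specifics, the handoff looks roughly like the sketch below (the job type, channel, and worker count are illustrative stand-ins, not the repository's identifiers):

```go
package main

import (
	"fmt"
	"sync"
)

// job mirrors the role of completionReqCtx: a payload plus a WaitGroup
// that lets the submitter block until the work is finished.
type job struct {
	payload string
	wg      *sync.WaitGroup
}

func worker(id int, jobs <-chan *job) {
	for j := range jobs {
		fmt.Printf("worker %d handled %q\n", id, j.payload)
		j.wg.Done() // unblock the submitter
	}
}

func main() {
	jobs := make(chan *job) // unbuffered: submitters wait for a free worker
	for i := 1; i <= 2; i++ {
		go worker(i, jobs)
	}

	// Each "handler" submits one job and waits for its completion,
	// just as handleCompletions now waits on reqCtx.wg.
	var handlers sync.WaitGroup
	for _, p := range []string{"req-a", "req-b", "req-c"} {
		handlers.Add(1)
		go func(p string) {
			defer handlers.Done()
			var wg sync.WaitGroup
			wg.Add(1)
			jobs <- &job{payload: p, wg: &wg}
			wg.Wait()
		}(p)
	}
	handlers.Wait()
}
```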

pkg/vllm-sim/metrics.go

Lines changed: 3 additions & 1 deletion
@@ -21,6 +21,7 @@ package vllmsim
 import (
 	"strconv"
 	"strings"
+	"sync/atomic"
 	"time"

 	vllmapi "github.com/neuralmagic/vllm-sim/pkg/vllm-api"

@@ -131,6 +132,7 @@ func (s *VllmSimulator) reportLoras() {

 // reportRequests sets information about running completion requests
 func (s *VllmSimulator) reportRequests() {
+	nRunningReqs := atomic.LoadInt64(&(s.nRunningReqs))
 	s.runningRequests.WithLabelValues(
-		s.model).Set(float64(s.nRunningReqs))
+		s.model).Set(float64(nRunningReqs))
 }
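Reading the counter through atomic.LoadInt64 matches the atomic.AddInt64 writes performed by the workers; pairing an atomic write with a plain field read would be a data race under the Go memory model. A self-contained sketch of that write/read pairing (the counter name and printed snapshot are illustrative, not the simulator's gauge wiring):

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

func main() {
	var inFlight int64
	var wg sync.WaitGroup

	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			atomic.AddInt64(&inFlight, 1)        // request starts
			defer atomic.AddInt64(&inFlight, -1) // request finishes
			// ... simulated work would go here ...
		}()
	}

	// A metrics reporter running concurrently must read via an atomic load;
	// accessing inFlight directly here would be a data race.
	snapshot := atomic.LoadInt64(&inFlight)
	fmt.Println("in-flight requests observed:", snapshot)

	wg.Wait()
}
```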

pkg/vllm-sim/simulator.go

Lines changed: 62 additions & 26 deletions
@@ -18,10 +18,12 @@ limitations under the License.
 package vllmsim

 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"net"
 	"strings"
+	"sync"
 	"sync/atomic"
 	"time"

@@ -38,12 +40,13 @@ import (
 // New creates a new VllmSimulator instance with the given logger
 func New(logger logr.Logger) *VllmSimulator {
 	return &VllmSimulator{
-		logger: logger,
+		logger:  logger,
+		reqChan: make(chan *completionReqCtx),
 	}
 }

 // Start starts the simulator
-func (s *VllmSimulator) Start() error {
+func (s *VllmSimulator) Start(ctx context.Context) error {
 	// parse command line parameters
 	err := s.parseCommandParams()
 	if err != nil {

@@ -56,6 +59,10 @@ func (s *VllmSimulator) Start() error {
 		return err
 	}

+	// run request processing workers
+	for i := 1; i <= int(s.maxRunningReqs); i++ {
+		go s.reqProcessingWorker(ctx, i)
+	}
 	// start the http server
 	return s.startServer()
 }

@@ -71,6 +78,7 @@ func (s *VllmSimulator) parseCommandParams() error {
 	pflag.StringVar(&lorasStr, "lora", "", "List of LoRA adapters, separated by comma")
 	pflag.IntVar(&s.maxLoras, "max-loras", 1, "Maximum number of LoRAs in a single batch")
 	pflag.IntVar(&s.maxCpuLoras, "max-cpu-loras", 0, "Maximum number of LoRAs to store in CPU memory")
+	pflag.Int64Var(&s.maxRunningReqs, "max-running-requests", 5, "Maximum number of inference requests that could be processed at the same time (parameter to simulate requests waiting queue)")

 	pflag.Parse()

@@ -250,32 +258,60 @@ func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatComple
 		return
 	}

-	if s.isLora(model) {
-		// if current request's model is LoRA, add it to the list of running loras
-		value, ok := s.runningLoras.Load(model)
-		intValue := 0
-
-		if !ok {
-			s.logger.Info("Create reference counter", "model", model)
-			intValue = 0
-		} else {
-			intValue = value.(int)
-		}
-		s.runningLoras.Store(model, intValue+1)
-		s.logger.Info("Update LoRA reference counter", "model", model, "old value", intValue, "new value", intValue+1)
-
-		// TODO - check if this request went to the waiting queue - add it to waiting map
-		s.reportLoras()
+	var wg sync.WaitGroup
+	wg.Add(1)
+	reqCtx := &completionReqCtx{
+		completionReq:    vllmReq,
+		httpReqCtx:       ctx,
+		isChatCompletion: isChatCompletion,
+		wg:               &wg,
 	}
-	atomic.AddInt64(&(s.nRunningReqs), 1)
-	s.reportRequests()
-
-	responseTxt := vllmReq.createResponseText(s.mode)
+	s.reqChan <- reqCtx
+	wg.Wait()
+}

-	if vllmReq.isStream() {
-		s.sendStreamingResponse(isChatCompletion, ctx, responseTxt, vllmReq.getModel())
-	} else {
-		s.sendResponse(isChatCompletion, ctx, responseTxt, vllmReq.getModel())
+func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
+	for {
+		select {
+		case <-ctx.Done():
+			s.logger.Info("reqProcessingWorker stopped:", "worker id", id)
+			return
+		case reqCtx, ok := <-s.reqChan:
+			if !ok {
+				s.logger.Info("reqProcessingWorker worker exiting: reqChan closed")
+				return
+			}
+			req := reqCtx.completionReq
+			model := req.getModel()
+			if s.isLora(model) {
+				// if current request's model is LoRA, add it to the list of running loras
+				value, ok := s.runningLoras.Load(model)
+				intValue := 0
+
+				if !ok {
+					s.logger.Info("Create reference counter", "model", model)
+					intValue = 0
+				} else {
+					intValue = value.(int)
+				}
+				s.runningLoras.Store(model, intValue+1)
+				s.logger.Info("Update LoRA reference counter", "model", model, "old value", intValue, "new value", intValue+1)
+
+				// TODO - check if this request went to the waiting queue - add it to waiting map
+				s.reportLoras()
+			}
+			atomic.AddInt64(&(s.nRunningReqs), 1)
+			s.reportRequests()
+
+			responseTxt := req.createResponseText(s.mode)
+
+			if req.isStream() {
+				s.sendStreamingResponse(reqCtx.isChatCompletion, reqCtx.httpReqCtx, responseTxt, model)
+			} else {
+				s.sendResponse(reqCtx.isChatCompletion, reqCtx.httpReqCtx, responseTxt, model)
+			}
+			reqCtx.wg.Done()
+		}
 	}
 }
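The concurrency cap comes from the combination of an unbuffered reqChan and a fixed number of reqProcessingWorker goroutines: at most maxRunningReqs requests can be inside a worker at once, and any additional handleCompletions calls block on the channel send, which is what simulates a waiting queue. A common alternative that achieves the same bound without dedicated workers is a buffered-channel semaphore; a minimal sketch of that variant (not what this commit implements; names and timings are illustrative):

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	const maxRunning = 3
	sem := make(chan struct{}, maxRunning) // channel capacity == concurrency limit

	var wg sync.WaitGroup
	for i := 1; i <= 10; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			sem <- struct{}{}        // acquire a slot; blocks while 3 are in flight
			defer func() { <-sem }() // release the slot when done
			fmt.Println("processing request", id)
			time.Sleep(50 * time.Millisecond) // stand-in for inference work
		}(i)
	}
	wg.Wait()
}
```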
