@@ -72,6 +72,8 @@ type VllmSimulator struct {
     mode string
     // model defines the current base model name
     model string
+    // one or many names exposed by the API
+    servedModelNames []string
     // loraAdaptors contains list of LoRA available adaptors
     loraAdaptors sync.Map
     // maxLoras defines maximum number of loaded loras
@@ -150,6 +152,7 @@ func (s *VllmSimulator) parseCommandParams() error {
     f.IntVar(&s.interTokenLatency, "inter-token-latency", 0, "Time to generate one token (in milliseconds)")
     f.IntVar(&s.timeToFirstToken, "time-to-first-token", 0, "Time to first token (in milliseconds)")
     f.StringVar(&s.model, "model", "", "Currently 'loaded' model")
+    f.StringSliceVar(&s.servedModelNames, "served-model-name", nil, "Model names exposed by the API (comma or space-separated)")
     var lorasStr string
     f.StringVar(&lorasStr, "lora", "", "List of LoRA adapters, separated by comma")
     f.IntVar(&s.maxLoras, "max-loras", 1, "Maximum number of LoRAs in a single batch")
@@ -169,6 +172,14 @@ func (s *VllmSimulator) parseCommandParams() error {
     if s.model == "" {
         return errors.New("model parameter is empty")
     }
+
+    // Upstream vLLM behaviour: when --served-model-name is not provided,
+    // it falls back to using the value of --model as the single public name
+    // returned by the API and exposed in Prometheus metrics.
+    if len(s.servedModelNames) == 0 {
+        s.servedModelNames = []string{s.model}
+    }
+
     if s.mode != modeEcho && s.mode != modeRandom {
         return fmt.Errorf("invalid mode '%s', valid values are 'random' and 'echo'", s.mode)
    }
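
A minimal standalone sketch of how this fallback resolves, mirroring the logic above (the model and alias names are hypothetical, and `resolveServedNames` is an illustrative helper, not part of the simulator):

```go
package main

import "fmt"

// resolveServedNames mirrors the fallback in parseCommandParams: if no
// aliases were given, the base model name becomes the single public name.
func resolveServedNames(model string, served []string) []string {
	if len(served) == 0 {
		return []string{model}
	}
	return served
}

func main() {
	// No --served-model-name: --model is the one public name.
	fmt.Println(resolveServedNames("meta-llama/Llama-2-7b-hf", nil))
	// => [meta-llama/Llama-2-7b-hf]

	// Aliases given: they replace the raw model name entirely.
	fmt.Println(resolveServedNames("meta-llama/Llama-2-7b-hf", []string{"llama-2", "llama"}))
	// => [llama-2 llama]
}
```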
@@ -301,10 +312,11 @@ func (s *VllmSimulator) HandleUnloadLora(ctx *fasthttp.RequestCtx) {
 
 // isValidModel checks if the given model is the base model or one of "loaded" LoRAs
 func (s *VllmSimulator) isValidModel(model string) bool {
-    if model == s.model {
-        return true
+    for _, name := range s.servedModelNames {
+        if model == name {
+            return true
+        }
     }
-
     for _, lora := range s.getLoras() {
         if model == lora {
             return true
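
Worth noting: once --served-model-name is supplied, the raw --model value is no longer accepted as a request model. isValidModel now matches only the public aliases and the loaded LoRA names, so a request for the underlying model path would be rejected.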
@@ -372,6 +384,8 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
 
            req := reqCtx.completionReq
            model := req.getModel()
+           displayModel := s.getDisplayedModelName(model)
+
            if s.isLora(model) {
                // if current request's model is LoRA, add it to the list of running loras
                value, ok := s.runningLoras.Load(model)
@@ -397,8 +411,11 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
            var err error
            var toolCalls []toolCall
            var completionTokens int
-           if reqCtx.isChatCompletion && req.getToolChoice() != toolChoiceNone && req.getTools() != nil {
-               toolCalls, finishReason, completionTokens, err = createToolCalls(req.getTools(), req.getToolChoice())
+           if reqCtx.isChatCompletion &&
+               req.getToolChoice() != toolChoiceNone &&
+               req.getTools() != nil {
+               toolCalls, finishReason, completionTokens, err =
+                   createToolCalls(req.getTools(), req.getToolChoice())
            }
            if toolCalls == nil && err == nil {
                // Either no tool calls were defined, or we randomly chose not to create tool calls,
@@ -426,10 +443,20 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
                    usageDataToSend = &usageData
                }
                s.sendStreamingResponse(
-                   &streamingContext{ctx: reqCtx.httpReqCtx, isChatCompletion: reqCtx.isChatCompletion, model: model},
-                   responseTokens, toolCalls, finishReason, usageDataToSend)
+                   &streamingContext{
+                       ctx:              reqCtx.httpReqCtx,
+                       isChatCompletion: reqCtx.isChatCompletion,
+                       model:            displayModel,
+                   },
+                   responseTokens, toolCalls, finishReason, usageDataToSend,
+               )
            } else {
-               s.sendResponse(reqCtx.isChatCompletion, reqCtx.httpReqCtx, responseTokens, toolCalls, model, finishReason,
+               s.sendResponse(reqCtx.isChatCompletion,
+                   reqCtx.httpReqCtx,
+                   responseTokens,
+                   toolCalls,
+                   displayModel,
+                   finishReason,
                    &usageData)
            }
        }
@@ -444,8 +471,8 @@ func (s *VllmSimulator) responseSentCallback(model string) {
    atomic.AddInt64(&(s.nRunningReqs), -1)
    s.reportRunningRequests()
 
-   if model == s.model {
-       // this is the base model - do not continue
+   // Only LoRA models require reference-count handling.
+   if !s.isLora(model) {
        return
    }
 
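
The guard flips from `model == s.model` to `!s.isLora(model)` because the callback now receives the display name (see sendResponse below): when --served-model-name is set, that alias never equals s.model, so the old equality check would fall through into the LoRA bookkeeping for a plain base-model request. Only LoRA adapters need the reference counting that follows.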
@@ -515,15 +542,16 @@ func (s *VllmSimulator) HandleError(_ *fasthttp.RequestCtx, err error) {
 // as defined by isChatCompletion
 // respTokens - tokenized content to be sent in the response
 // toolCalls - tool calls to be sent in the response
-// model - model name
 // finishReason - a pointer to string that represents finish reason, can be nil or stop or length, ...
 // usageData - usage (tokens statistics) for this response
-func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respTokens []string, toolCalls []toolCall, model string,
-   finishReason *string, usageData *usage) completionResponse {
+// modelName - display name returned to the client and used in metrics. It is either the first alias
+// from --served-model-name (for a base-model request) or the LoRA adapter name (for a LoRA request).
+func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respTokens []string, toolCalls []toolCall,
+   finishReason *string, usageData *usage, modelName string) completionResponse {
    baseResp := baseCompletionResponse{
        ID:      chatComplIDPrefix + uuid.NewString(),
        Created: time.Now().Unix(),
-       Model:   model,
+       Model:   modelName,
        Usage:   usageData,
    }
    baseChoice := baseResponseChoice{Index: 0, FinishReason: finishReason}
@@ -555,12 +583,13 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke
 // according the value of isChatCompletion
 // respTokens - tokenized content to be sent in the response
 // toolCalls - tool calls to be sent in the response
-// model - model name
+// modelName - display name returned to the client and used in metrics. It is either the first alias
+// from --served-model-name (for a base-model request) or the LoRA adapter name (for a LoRA request).
 // finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools
 // usageData - usage (tokens statistics) for this response
 func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.RequestCtx, respTokens []string, toolCalls []toolCall,
-   model string, finishReason string, usageData *usage) {
-   resp := s.createCompletionResponse(isChatCompletion, respTokens, toolCalls, model, &finishReason, usageData)
+   modelName string, finishReason string, usageData *usage) {
+   resp := s.createCompletionResponse(isChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName)
 
    data, err := json.Marshal(resp)
    if err != nil {
@@ -578,32 +607,35 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
    ctx.Response.Header.SetStatusCode(fasthttp.StatusOK)
    ctx.Response.SetBody(data)
 
-   s.responseSentCallback(model)
+   s.responseSentCallback(modelName)
 }
 
 // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist
 func (s *VllmSimulator) createModelsResponse() *vllmapi.ModelsResponse {
    modelsResp := vllmapi.ModelsResponse{Object: "list", Data: []vllmapi.ModelsResponseModelInfo{}}
 
-   // add base model's info
-   modelsResp.Data = append(modelsResp.Data, vllmapi.ModelsResponseModelInfo{
-       ID:      s.model,
-       Object:  vllmapi.ObjectModel,
-       Created: time.Now().Unix(),
-       OwnedBy: "vllm",
-       Root:    s.model,
-       Parent:  nil,
-   })
+   // Advertise every public model alias
+   for _, alias := range s.servedModelNames {
+       modelsResp.Data = append(modelsResp.Data, vllmapi.ModelsResponseModelInfo{
+           ID:      alias,
+           Object:  vllmapi.ObjectModel,
+           Created: time.Now().Unix(),
+           OwnedBy: "vllm",
+           Root:    alias,
+           Parent:  nil,
+       })
+   }
 
    // add LoRA adapter's info
+   parent := s.servedModelNames[0]
    for _, lora := range s.getLoras() {
        modelsResp.Data = append(modelsResp.Data, vllmapi.ModelsResponseModelInfo{
            ID:      lora,
            Object:  vllmapi.ObjectModel,
            Created: time.Now().Unix(),
            OwnedBy: "vllm",
            Root:    lora,
-           Parent:  &s.model,
+           Parent:  &parent,
        })
    }
 
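
For illustration, roughly what GET /v1/models could now return for a simulator started with the hypothetical flags `--model base-model --served-model-name alias-a alias-b` and one loaded LoRA named `lora-x`. This is a sketch, not captured output; the JSON field names assume conventional OpenAI-style tags on vllmapi.ModelsResponseModelInfo and that vllmapi.ObjectModel is "model", and the timestamp is made up:

```json
{
  "object": "list",
  "data": [
    {"id": "alias-a", "object": "model", "created": 1700000000, "owned_by": "vllm", "root": "alias-a", "parent": null},
    {"id": "alias-b", "object": "model", "created": 1700000000, "owned_by": "vllm", "root": "alias-b", "parent": null},
    {"id": "lora-x", "object": "model", "created": 1700000000, "owned_by": "vllm", "root": "lora-x", "parent": "alias-a"}
  ]
}
```

Note that every alias is advertised as its own entry, and LoRA adapters now report the first alias, rather than the raw --model value, as their parent.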
@@ -625,3 +657,13 @@ func (s *VllmSimulator) HandleReady(ctx *fasthttp.RequestCtx) {
    ctx.Response.Header.SetStatusCode(fasthttp.StatusOK)
    ctx.Response.SetBody([]byte("{}"))
 }
+
+// getDisplayedModelName returns the model name that must appear in API
+// responses. LoRA adapters keep their explicit name, while all base-model
+// requests are surfaced as the first alias from --served-model-name.
+func (s *VllmSimulator) getDisplayedModelName(reqModel string) string {
+   if s.isLora(reqModel) {
+       return reqModel
+   }
+   return s.servedModelNames[0]
+}
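
A runnable sketch of the resulting name resolution, using a standalone mirror of getDisplayedModelName rather than the simulator's actual types; the alias and LoRA names are hypothetical:

```go
package main

import "fmt"

// Hypothetical state: two public aliases and one loaded LoRA.
var (
	servedModelNames = []string{"alias-a", "alias-b"}
	loras            = map[string]bool{"lora-x": true}
)

// getDisplayedModelName mirrors the new method: LoRA adapters keep their
// explicit name; everything else surfaces the first public alias.
func getDisplayedModelName(reqModel string) string {
	if loras[reqModel] {
		return reqModel
	}
	return servedModelNames[0]
}

func main() {
	fmt.Println(getDisplayedModelName("alias-a")) // alias-a
	fmt.Println(getDisplayedModelName("alias-b")) // alias-a (first alias wins)
	fmt.Println(getDisplayedModelName("lora-x"))  // lora-x  (LoRAs keep their own name)
}
```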