@@ -307,6 +307,8 @@ type ChatCompletionRequest struct {
307
307
// Such as think mode for qwen3. "chat_template_kwargs": {"enable_thinking": false}
308
308
// https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes
309
309
ChatTemplateKwargs map [string ]any `json:"chat_template_kwargs,omitempty"`
310
+ // Specifies the latency tier to use for processing the request.
311
+ ServiceTier ServiceTier `json:"service_tier,omitempty"`
310
312
}
311
313
312
314
type StreamOptions struct {
@@ -390,6 +392,15 @@ const (
390
392
FinishReasonNull FinishReason = "null"
391
393
)
392
394
395
// ServiceTier specifies the latency tier to use for processing a request.
// It is sent in ChatCompletionRequest ("service_tier") and echoed back in
// ChatCompletionResponse.
type ServiceTier string

// Service tier values accepted by the API.
const (
	ServiceTierAuto     ServiceTier = "auto"
	ServiceTierDefault  ServiceTier = "default"
	ServiceTierFlex     ServiceTier = "flex"
	ServiceTierPriority ServiceTier = "priority"
)
403
+
393
404
func (r FinishReason ) MarshalJSON () ([]byte , error ) {
394
405
if r == FinishReasonNull || r == "" {
395
406
return []byte ("null" ), nil
@@ -422,6 +433,7 @@ type ChatCompletionResponse struct {
422
433
Usage Usage `json:"usage"`
423
434
SystemFingerprint string `json:"system_fingerprint"`
424
435
PromptFilterResults []PromptFilterResult `json:"prompt_filter_results,omitempty"`
436
+ ServiceTier ServiceTier `json:"service_tier,omitempty"`
425
437
426
438
httpHeader
427
439
}
0 commit comments