@@ -30,21 +30,21 @@ class CreateLLMModelEndpointV1Request(BaseModel):
     # LLM specific fields
     model_name: str
     source: LLMSource = LLMSource.HUGGING_FACE
-    inference_framework: LLMInferenceFramework = LLMInferenceFramework.DEEPSPEED
-    inference_framework_image_tag: str
+    inference_framework: LLMInferenceFramework = LLMInferenceFramework.VLLM
+    inference_framework_image_tag: str = "latest"
     num_shards: int = 1
     """
-    Number of shards to distribute the model onto GPUs. Only affects behavior for text-generation-inference models
+    Number of shards to distribute the model onto GPUs.
     """

     quantize: Optional[Quantization] = None
     """
-    Whether to quantize the model. Only affect behavior for text-generation-inference models
+    Whether to quantize the model.
     """

     checkpoint_path: Optional[str] = None
     """
-    Path to the checkpoint to load the model from. Only affects behavior for text-generation-inference models
+    Path to the checkpoint to load the model from.
     """

     # General endpoint fields
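The net effect of this hunk is that newly created endpoints default to vLLM with the "latest" image tag instead of requiring the framework and tag to be spelled out. A minimal, self-contained sketch of the new default behavior, using a hypothetical stand-in model (the real CreateLLMModelEndpointV1Request also carries the general endpoint fields, which are omitted here; the enum values are assumptions):

from enum import Enum
from pydantic import BaseModel

class LLMInferenceFramework(str, Enum):
    # Stand-in enum; only the two members that appear in this hunk are listed.
    DEEPSPEED = "deepspeed"
    VLLM = "vllm"

class EndpointRequestSketch(BaseModel):
    # Mirrors just the LLM-specific fields changed above.
    model_name: str
    inference_framework: LLMInferenceFramework = LLMInferenceFramework.VLLM
    inference_framework_image_tag: str = "latest"
    num_shards: int = 1

req = EndpointRequestSketch(model_name="llama-2-7b")
assert req.inference_framework == LLMInferenceFramework.VLLM  # default is now vLLM
assert req.inference_framework_image_tag == "latest"          # tag no longer required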
@@ -102,17 +102,17 @@ class UpdateLLMModelEndpointV1Request(BaseModel):
     inference_framework_image_tag: Optional[str]
     num_shards: Optional[int]
     """
-    Number of shards to distribute the model onto GPUs. Only affects behavior for text-generation-inference models
+    Number of shards to distribute the model onto GPUs.
     """

     quantize: Optional[Quantization]
     """
-    Whether to quantize the model. Only affect behavior for text-generation-inference models
+    Whether to quantize the model.
     """

     checkpoint_path: Optional[str]
     """
-    Path to the checkpoint to load the model from. Only affects behavior for text-generation-inference models
+    Path to the checkpoint to load the model from.
     """

     # General endpoint fields
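Because every field shown on UpdateLLMModelEndpointV1Request is Optional, callers can send partial updates and the server can distinguish "not provided" from "explicitly set". A small sketch of that pattern with a hypothetical stand-in model, assuming Pydantic v1 semantics (where Optional fields default to None and .dict(exclude_unset=True) drops unset fields):

from typing import Optional
from pydantic import BaseModel

class UpdateRequestSketch(BaseModel):
    # Stand-in for the update request: every field is Optional, so nothing is required.
    inference_framework_image_tag: Optional[str]
    num_shards: Optional[int]
    checkpoint_path: Optional[str]

patch = UpdateRequestSketch(num_shards=2)
# Only the fields the caller actually set are forwarded; unset fields are dropped.
assert patch.dict(exclude_unset=True) == {"num_shards": 2}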
@@ -220,7 +220,7 @@ class CompletionStreamV1Request(BaseModel):
     """
     return_token_log_probs: Optional[bool] = False
     """
-    Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models
+    Whether to return the log probabilities of the tokens.
     """
     presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
     """
@@ -359,3 +359,104 @@ class ModelDownloadResponse(BaseModel):

 class DeleteLLMEndpointResponse(BaseModel):
     deleted: bool
+
+
+class CreateBatchCompletionsRequestContent(BaseModel):
+    prompts: List[str]
+    max_new_tokens: int
+    temperature: float = Field(ge=0.0, le=1.0)
+    """
+    Temperature of the sampling. Setting to 0 equals to greedy sampling.
+    """
+    stop_sequences: Optional[List[str]] = None
+    """
+    List of sequences to stop the completion at.
+    """
+    return_token_log_probs: Optional[bool] = False
+    """
+    Whether to return the log probabilities of the tokens.
+    """
+    presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
+    """
+    Only supported in vllm, lightllm
+    Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty
+    """
+    frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
+    """
+    Only supported in vllm, lightllm
+    Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty
+    """
+    top_k: Optional[int] = Field(default=None, ge=-1)
+    """
+    Controls the number of top tokens to consider. -1 means consider all tokens.
+    """
+    top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0)
+    """
+    Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens.
+    """
+
+
+class CreateBatchCompletionsModelConfig(BaseModel):
+    model: str
+    checkpoint_path: Optional[str] = None
+    """
+    Path to the checkpoint to load the model from.
+    """
+    labels: Dict[str, str]
+    """
+    Labels to attach to the batch inference job.
+    """
+    num_shards: Optional[int] = 1
+    """
+    Suggested number of shards to distribute the model. When not specified, will infer the number of shards based on model config.
+    System may decide to use a different number than the given value.
+    """
+    quantize: Optional[Quantization] = None
+    """
+    Whether to quantize the model.
+    """
+    seed: Optional[int] = None
+    """
+    Random seed for the model.
+    """
+
+
+class CreateBatchCompletionsRequest(BaseModel):
+    """
+    Request object for batch completions.
+    """
+
+    input_data_path: Optional[str]
+    output_data_path: str
+    """
+    Path to the output file. The output file will be a JSON file of type List[CompletionOutput].
+    """
+    content: Optional[CreateBatchCompletionsRequestContent] = None
+    """
+    Either `input_data_path` or `content` needs to be provided.
+    When input_data_path is provided, the input file should be a JSON file of type BatchCompletionsRequestContent.
+    """
+    model_config: CreateBatchCompletionsModelConfig
+    """
+    Model configuration for the batch inference. Hardware configurations are inferred.
+    """
+    data_parallelism: Optional[int] = Field(default=1, ge=1, le=64)
+    """
+    Number of replicas to run the batch inference. More replicas are slower to schedule but faster to inference.
+    """
+    max_runtime_sec: Optional[int] = Field(default=24 * 3600, ge=1, le=2 * 24 * 3600)
+    """
+    Maximum runtime of the batch inference in seconds. Default to one day.
+    """
+
+
+class CreateBatchCompletionsResponse(BaseModel):
+    job_id: str
+
+
+class GetBatchCompletionsResponse(BaseModel):
+    progress: float
+    """
+    Progress of the batch inference in percentage from 0 to 100.
+    """
+    finished: bool
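Putting the new batch classes together, a request can either inline its prompts via `content` or point at a JSON file via `input_data_path`. A sketch of the inline form; the import path, bucket path, and model name are assumptions, and the example relies on Pydantic v1 (where a field named `model_config` is allowed and `.json()` is available):

from model_engine_server.common.dtos.llms import (  # assumed module path
    CreateBatchCompletionsModelConfig,
    CreateBatchCompletionsRequest,
    CreateBatchCompletionsRequestContent,
)

request = CreateBatchCompletionsRequest(
    output_data_path="s3://my-bucket/batch-output.json",  # hypothetical output location
    content=CreateBatchCompletionsRequestContent(
        prompts=["Summarize: ...", "Translate to French: ..."],
        max_new_tokens=128,
        temperature=0.0,  # 0 means greedy sampling
    ),
    model_config=CreateBatchCompletionsModelConfig(
        model="llama-2-7b",           # hypothetical model name
        labels={"team": "research"},  # labels attached to the batch job
    ),
    data_parallelism=2,  # run two replicas of the batch job
)
print(request.json(exclude_none=True))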