Commit 89509d8

Change back batch infer GPU util and add tool completion client changes (#465)
* Change back batch infer gpu util
* Add client changes
* fixes
* bump
1 parent c833e91 commit 89509d8

8 files changed: +94 −5 lines changed

clients/python/llmengine/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.0.0b25"
+__version__ = "0.0.0b26"
 
 import os
 from typing import Sequence

clients/python/llmengine/completion.py

Lines changed: 33 additions & 0 deletions
@@ -10,6 +10,7 @@
     CreateBatchCompletionsRequest,
     CreateBatchCompletionsRequestContent,
     CreateBatchCompletionsResponse,
+    ToolConfig,
 )
 
 COMPLETION_TIMEOUT = 300
@@ -412,6 +413,7 @@ def batch_create(
         input_data_path: Optional[str] = None,
         data_parallelism: int = 1,
         max_runtime_sec: int = 24 * 3600,
+        tool_config: Optional[ToolConfig] = None,
     ) -> CreateBatchCompletionsResponse:
         """
         Creates a batch completion for the provided input data. The job runs offline and does not depend on an existing model endpoint.
@@ -437,6 +439,13 @@ def batch_create(
            max_runtime_sec (int):
                The maximum runtime of the batch completion in seconds. Defaults to 24 hours.
 
+            tool_config (Optional[ToolConfig]):
+                Configuration for tool use.
+                NOTE: this config is highly experimental and signature will change significantly in future iterations.
+                Currently only Python code evaluator is supported.
+                Python code context starts with "\`\`\`python\\n" and ends with "\\n>>>\\n", data before "\\n\`\`\`\\n" and content end will be replaced by the Python execution results.
+                Please format prompts accordingly and provide examples so LLMs could properly generate Python code.
+
         Returns:
             response (CreateBatchCompletionsResponse): The response containing the job id.
 
@@ -480,6 +489,29 @@ def batch_create(
             )
             print(response.json())
             ```
+
+        === "Batch completions with prompts and use tool"
+            ```python
+            from llmengine import Completion
+            from llmengine.data_types import CreateBatchCompletionsModelConfig, CreateBatchCompletionsRequestContent, ToolConfig
+
+            # Store CreateBatchCompletionsRequestContent data into input file "s3://my-input-path"
+
+            response = Completion.batch_create(
+                input_data_path="s3://my-input-path",
+                output_data_path="s3://my-output-path",
+                model_config=CreateBatchCompletionsModelConfig(
+                    model="llama-2-7b",
+                    checkpoint_path="s3://checkpoint-path",
+                    labels={"team":"my-team", "product":"my-product"}
+                ),
+                data_parallelism=2,
+                tool_config=ToolConfig(
+                    name="code_evaluator",
+                )
+            )
+            print(response.json())
+            ```
         """
         data = CreateBatchCompletionsRequest(
             model_config=model_config,
@@ -488,6 +520,7 @@ def batch_create(
             output_data_path=output_data_path,
             data_parallelism=data_parallelism,
             max_runtime_sec=max_runtime_sec,
+            tool_config=tool_config,
         ).dict()
         response = cls.post_sync(
             resource_name="v1/llm/batch-completions",
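The new tool_config docstring above describes a prompt convention for the code evaluator: a fenced ```python block whose execution-result region runs up to a "\n>>>\n" marker. As a rough illustration only (not from the commit; the arithmetic task and few-shot text are invented), a prompt following that shape might be assembled like this:

```python
# Hypothetical prompt for the code-evaluator tool, following the convention described
# in the docstring above (as I read it): the model emits a ```python ... ``` block,
# and the text after the closing fence, up to the "\n>>>\n" marker, is the region the
# tool overwrites with the actual Python execution results.
few_shot_example = (
    "What is 2 + 2?\n"
    "```python\n"
    "print(2 + 2)\n"
    "```\n"
    "4\n"            # placeholder result region, replaced by the tool at runtime
    ">>>\n"
    "The answer is 4.\n"
)

# New question appended after the worked example, so the LLM imitates the format.
prompt = few_shot_example + "What is 123 * 456?\n"
print(prompt)
```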

clients/python/llmengine/data_types.py

Lines changed: 30 additions & 0 deletions
@@ -1,6 +1,7 @@
 """
 DTOs for LLM APIs.
 """
+
 import datetime
 from enum import Enum
 from typing import Any, Dict, List, Literal, Optional, Union
@@ -658,6 +659,30 @@ class CreateBatchCompletionsModelConfig(BaseModel):
     """
 
 
+class ToolConfig(BaseModel):
+    """
+    Configuration for tool use.
+    NOTE: this config is highly experimental and signature will change significantly in future iterations.
+    """
+
+    name: str
+    """
+    Name of the tool to use for the batch inference.
+    """
+    max_iterations: Optional[int] = 10
+    """
+    Maximum number of iterations to run the tool.
+    """
+    execution_timeout_seconds: Optional[int] = 60
+    """
+    Maximum runtime of the tool in seconds.
+    """
+    should_retry_on_error: Optional[bool] = True
+    """
+    Whether to retry the tool on error.
+    """
+
+
 class CreateBatchCompletionsRequest(BaseModel):
     """
     Request object for batch completions.
@@ -685,6 +710,11 @@ class CreateBatchCompletionsRequest(BaseModel):
     """
     Maximum runtime of the batch inference in seconds. Default to one day.
     """
+    tool_config: Optional[ToolConfig] = None
+    """
+    Configuration for tool use.
+    NOTE: this config is highly experimental and signature will change significantly in future iterations.
+    """
 
 
 class CreateBatchCompletionsResponse(BaseModel):
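Not part of the commit, but a quick sketch of the new DTO in use: constructing ToolConfig with only the required name and inspecting the defaults declared above (pydantic v1-style .dict(), as used elsewhere in this client):

```python
from llmengine.data_types import ToolConfig

# Only `name` is required; the other fields fall back to the defaults defined above.
tool_config = ToolConfig(name="code_evaluator")
print(tool_config.dict())
# Expected, given the declared defaults:
# {'name': 'code_evaluator', 'max_iterations': 10,
#  'execution_timeout_seconds': 60, 'should_retry_on_error': True}
```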

clients/python/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scale-llm-engine"
-version = "0.0.0.beta25"
+version = "0.0.0.beta26"
 description = "Scale LLM Engine Python client"
 license = "Apache-2.0"
 authors = ["Phil Chen <phil.chen@scale.com>"]

clients/python/setup.py

Lines changed: 1 addition & 1 deletion
@@ -3,6 +3,6 @@
 setup(
     name="scale-llm-engine",
     python_requires=">=3.7",
-    version="0.0.0.beta25",
+    version="0.0.0.beta26",
     packages=find_packages(),
 )

docs/api/data_types.md

Lines changed: 1 addition & 0 deletions
@@ -143,6 +143,7 @@
             - model_config
             - data_parallelism
             - max_runtime_sec
+            - tool_config
 
 ::: llmengine.CreateBatchCompletionsResponse
     options:

docs/guides/completions.md

Lines changed: 25 additions & 1 deletion
@@ -122,7 +122,7 @@ asyncio.run(main())
 
 ## Batch completions
 
-The Python client also supports batch completions. Batch completions supports distributing data to multiple workers to accelerate inference. It also tries to maximize throughput so the completions should finish quite a bit faster than hitting models through HTTP. Use [Completion.batch_complete](../../api/python_client/#llmengine.completion.Completion.batch_complete) to utilize batch completions.
+The Python client also supports batch completions. Batch completions supports distributing data to multiple workers to accelerate inference. It also tries to maximize throughput so the completions should finish quite a bit faster than hitting models through HTTP. Use [Completion.batch_create](../../api/python_client/#llmengine.Completion.batch_create) to utilize batch completions.
 
 Some examples of batch completions:
 
@@ -169,6 +169,30 @@ response = Completion.batch_create(
     print(response.job_id)
     ```
 
+=== "Batch completions with prompts and use tool"
+    For how to properly use the tool please see [Completion.batch_create](../../api/python_client/#llmengine.Completion.batch_create) tool_config doc.
+    ```python
+    from llmengine import Completion
+    from llmengine.data_types import CreateBatchCompletionsModelConfig, CreateBatchCompletionsRequestContent, ToolConfig
+
+    # Store CreateBatchCompletionsRequestContent data into input file "s3://my-input-path"
+
+    response = Completion.batch_create(
+        input_data_path="s3://my-input-path",
+        output_data_path="s3://my-output-path",
+        model_config=CreateBatchCompletionsModelConfig(
+            model="llama-2-7b",
+            checkpoint_path="s3://checkpoint-path",
+            labels={"team":"my-team", "product":"my-product"}
+        ),
+        data_parallelism=2,
+        tool_config=ToolConfig(
+            name="code_evaluator",
+        )
+    )
+    print(response.json())
+    ```
+
 ## Which model should I use?
 
 See the [Model Zoo](../../model_zoo) for more information on best practices for which model to use for Completions.
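The guide example above assumes CreateBatchCompletionsRequestContent data has already been written to "s3://my-input-path". A hedged sketch of that preparation step follows; the field names (prompts, max_new_tokens, temperature) are assumptions to verify against llmengine.data_types, and a local file stands in for the S3 path:

```python
import json

from llmengine.data_types import CreateBatchCompletionsRequestContent

# Assumed fields -- check the CreateBatchCompletionsRequestContent schema for the
# authoritative list before relying on this.
content = CreateBatchCompletionsRequestContent(
    prompts=["What is 123 * 456?"],
    max_new_tokens=256,
    temperature=0.2,
)

# Serialize the request content, then upload this file to the object-store path
# that is passed to batch_create as input_data_path.
with open("batch_input.json", "w") as f:
    json.dump(content.dict(), f)
```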

model-engine/model_engine_server/inference/batch_inference/vllm_batch.py

Lines changed: 2 additions & 1 deletion
@@ -132,7 +132,7 @@ def get_vllm_engine(model, request):
         tensor_parallel_size=request.model_config.num_shards,
         seed=request.model_config.seed or 0,
         disable_log_requests=True,
-        gpu_memory_utilization=0.8,  # To avoid OOM errors when there's host machine GPU usage
+        gpu_memory_utilization=0.9,
     )
 
     llm = AsyncLLMEngine.from_engine_args(engine_args)
@@ -432,6 +432,7 @@ def check_unknown_startup_memory_usage():  # pragma: no cover
     """Check for unknown memory usage at startup."""
     gpu_free_memory = get_gpu_free_memory()
     if gpu_free_memory is not None:
+        print(f"GPU free memory at startup in MB: {gpu_free_memory}")
         min_mem = min(gpu_free_memory)
         max_mem = max(gpu_free_memory)
         if max_mem - min_mem > 10:
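For context: the first hunk raises vLLM's gpu_memory_utilization from 0.8 back to 0.9, and the second makes the startup check log raw per-GPU free memory before comparing it. Below is a self-contained sketch of that check's logic (warn when the spread across GPUs exceeds 10 MB), with get_gpu_free_memory stubbed out; the real helper queries the host's GPUs and the warning text here is illustrative, not the module's actual message:

```python
def get_gpu_free_memory():
    """Stub: pretend we read free memory (in MB) for each GPU on the host."""
    return [80900, 80875]

def check_unknown_startup_memory_usage():
    """Check for unknown memory usage at startup."""
    gpu_free_memory = get_gpu_free_memory()
    if gpu_free_memory is not None:
        print(f"GPU free memory at startup in MB: {gpu_free_memory}")
        min_mem = min(gpu_free_memory)
        max_mem = max(gpu_free_memory)
        if max_mem - min_mem > 10:
            # A >10 MB spread suggests some other process is already using one of the GPUs.
            print(f"WARNING: uneven GPU free memory at startup (min={min_mem} MB, max={max_mem} MB)")

check_unknown_startup_memory_usage()
```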
