
Commit 0e224fa

Support LLM on custom resources beyond GPU.

Signed-off-by: liuxsh9 <liuxiaoshuang4@huawei.com>

committed by liuxsh9 · 1 parent 31878c9 · commit 0e224fa

File tree: 6 files changed, +42 / -12 lines

  python/ray/llm/_internal/batch/processor/base.py
  python/ray/llm/_internal/batch/processor/vllm_engine_proc.py
  python/ray/llm/_internal/batch/stages/vllm_engine_stage.py
  python/ray/llm/_internal/serve/configs/server_models.py
  python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py
  python/ray/util/accelerators/accelerators.py

python/ray/llm/_internal/batch/processor/base.py (+5)

@@ -26,6 +26,11 @@ class ProcessorConfig(BaseModelExtended):
         "You can tune the batch size to balance the throughput and fault-tolerance "
         "based on your use case. Defaults to 64.",
     )
+    resources_per_worker: Optional[Dict[str, float]] = Field(
+        default=None,
+        description="This overrides the default resource config for actors/workers; "
+        "the default resource config for the LLM stage is typically {'GPU': 1}.",
+    )
     accelerator_type: Optional[str] = Field(
         default=None,
         description="The accelerator type used by the LLM stage in a processor. "

python/ray/llm/_internal/batch/processor/vllm_engine_proc.py (+1)

@@ -167,6 +167,7 @@ def build_vllm_engine_processor(
                # This is used to make sure we overlap batches to avoid the tail
                # latency of each batch.
                max_concurrency=config.max_concurrent_batches,
+                resources=config.resources_per_worker,
                accelerator_type=config.accelerator_type,
                runtime_env=config.runtime_env,
            ),
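
The `resources` entry added here is what the stage's `post_init` hook (next file) later reads back via `map_batches_kwargs.get("resources")`. A compressed sketch of that hand-off, with assumed values:

# Illustrative only: how the processor-level config reaches the stage kwargs.
config_resources = {"NPU": 1}          # stands in for config.resources_per_worker
map_batches_kwargs = dict(
    max_concurrency=4,                 # stands in for config.max_concurrent_batches
    resources=config_resources,        # the line added in this hunk
)
assert map_batches_kwargs.get("resources") == {"NPU": 1}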

python/ray/llm/_internal/batch/stages/vllm_engine_stage.py (+21, -11)

@@ -521,26 +521,28 @@ def __del__(self):
         self.llm.shutdown()


-def _ray_scheduling_strategy_fn(num_gpus_per_instance: int, accelerator_type: str):
+def _ray_scheduling_strategy_fn(
+    num_workers_per_instance: int, accelerator_type: str, resources: Optional[Dict[str, float]] = None
+):
     """
     Create a Ray scheduling strategy for vLLM engine.

     Args:
-        num_gpus_per_instance: The number of GPUs per instance.
+        num_workers_per_instance: The number of workers per instance.
         accelerator_type: The accelerator type.

     Returns:
         The Ray scheduling strategy.
     """

     def _get_bundle() -> Dict[str, float]:
-        bundle: Dict[str, float] = {"GPU": 1, "CPU": 1}
+        bundle: Dict[str, float] = resources if resources else {"GPU": 1, "CPU": 1}
         if accelerator_type:
             bundle[f"accelerator_type:{accelerator_type}"] = 0.001
         return bundle

     pg = ray.util.placement_group(
-        [_get_bundle()] * num_gpus_per_instance,
+        [_get_bundle()] * num_workers_per_instance,
         strategy="STRICT_PACK",
     )
     return dict(

@@ -569,6 +571,7 @@ def post_init(cls, values):
         The updated values.
         """
         map_batches_kwargs = values["map_batches_kwargs"]
+        resources_per_worker = map_batches_kwargs.get("resources")
         accelerator_type = map_batches_kwargs.get("accelerator_type", "")
         fn_constructor_kwargs = values["fn_constructor_kwargs"]
         engine_kwargs = fn_constructor_kwargs.get("engine_kwargs", {})

@@ -577,29 +580,36 @@ def post_init(cls, values):
         if accelerator_type:
             ray_remote_args["accelerator_type"] = accelerator_type

-        # Setup num_gpus required per vLLM engine.
+        # Setup num_workers required per vLLM engine.
         tp_size = engine_kwargs.get("tensor_parallel_size", 1)
         pp_size = engine_kwargs.get("pipeline_parallel_size", 1)
-        num_gpus = tp_size * pp_size
+        num_workers = tp_size * pp_size

         # Use the MP backend by default.
         engine_kwargs.setdefault("distributed_executor_backend", "mp")
         executor_backend = engine_kwargs.get("distributed_executor_backend")

-        # When Ray is used in the vLLM engine, we set num_gpus to 0 so that
+        # When Ray is used in the vLLM engine, we set num_devices to 0 so that
         # Ray Data won't reserve GPUs in advance. Instead, we specify scheduling
         # strategy in .map_batches() arguments and let vLLM Ray executor to
         # create placement groups for each TP/PP worker.
-        if executor_backend == "ray" and num_gpus > 1:
+        num_mp_workers = num_workers
+        if executor_backend == "ray" and num_workers > 1:
             # Note that we have to use partial() to pass a function
             # instead of an object.
             map_batches_kwargs["ray_remote_args_fn"] = partial(
                 _ray_scheduling_strategy_fn,
-                num_gpus,
+                num_workers,
                 accelerator_type,
             )
-            num_gpus = 0
+            num_mp_workers = 0
+
+        if not resources_per_worker:
+            map_batches_kwargs["num_gpus"] = num_mp_workers
+        else:
+            ray_remote_args["resources"] = {
+                key: value * num_mp_workers for key, value in resources_per_worker.items()
+            }

-        map_batches_kwargs["num_gpus"] = num_gpus
         map_batches_kwargs.update(ray_remote_args)
         return values
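
To make the new branching concrete, a small worked sketch of the values involved; all inputs are assumed for illustration and the "NPU" resource name is arbitrary.

# Assumed inputs: tensor_parallel_size=2, pipeline_parallel_size=1,
# resources_per_worker={"NPU": 1}, accelerator_type="Ascend910B".
num_workers = 2 * 1                     # tp_size * pp_size

# MP backend (the default): the whole engine lives in one Ray Data actor, so
# post_init multiplies the per-worker resources by the worker count instead of
# setting num_gpus on the actor:
ray_remote_args = {"resources": {"NPU": 1 * num_workers}}   # -> {"NPU": 2}

# Ray backend with num_workers > 1: if the custom resources are supplied to
# _ray_scheduling_strategy_fn, _get_bundle() builds one bundle per TP/PP
# worker and the bundles are packed with STRICT_PACK:
bundle = {"NPU": 1}                     # instead of {"GPU": 1, "CPU": 1}
bundle["accelerator_type:Ascend910B"] = 0.001
bundles = [bundle] * num_workers        # -> two identical bundles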

python/ray/llm/_internal/serve/configs/server_models.py (+6)

@@ -187,6 +187,12 @@ class LLMConfig(BaseModelExtended):
         ),
     )

+    resources_per_worker: Optional[Dict[str, float]] = Field(
+        default=None,
+        description="This is passed down to configs like `VLLMEngineConfig` and overrides "
+        "the resource config for the workers in the vLLM engine.",
+    )
+
     accelerator_type: Optional[str] = Field(
         default=None,
         description=f"The type of accelerator runs the model on. Only the following values are supported: {str([t.value for t in GPUType])}",

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py (+7, -1)

@@ -51,6 +51,11 @@ class VLLMEngineConfig(BaseModelExtended):
         None,
         description="Configuration for cloud storage mirror. This is for where the weights are downloaded from.",
     )
+    resources_per_worker: Optional[Dict[str, float]] = Field(
+        default=None,
+        description="This overrides the vLLM engine worker's default resource configuration, "
+        "i.e. the resources returned by `placement_bundles`.",
+    )
     accelerator_type: Optional[GPUType] = Field(
         None,
         description="The type of accelerator to use. This is used to determine the placement group strategy.",

@@ -104,6 +109,7 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig":
             model_id=llm_config.model_id,
             hf_model_id=hf_model_id,
             mirror_config=mirror_config,
+            accelerator_name=llm_config.accelerator_name,
             accelerator_type=llm_config.accelerator_type,
             engine_kwargs=llm_config.engine_kwargs,
             runtime_env=llm_config.runtime_env,

@@ -134,7 +140,7 @@ def placement_strategy(self) -> str:

     @property
     def placement_bundles(self) -> List[Dict[str, float]]:
-        bundle = {"GPU": 1}
+        bundle = {self.accelerator_name: 1}
         if self.accelerator_type:
             bundle[self.ray_accelerator_type()] = 0.001
         bundles = [bundle for _ in range(self.num_gpu_workers)]
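
To illustrate the `placement_bundles` change, a sketch of the expected output under assumed values: suppose `accelerator_name` resolves to "NPU" (how it is populated is outside this diff) and there are two workers (`tensor_parallel_size=2`).

# Illustrative only: what placement_bundles would now return.
accelerator_name = "NPU"                # assumed; previously hard-coded as "GPU"
num_gpu_workers = 2

bundle = {accelerator_name: 1}
bundles = [bundle for _ in range(num_gpu_workers)]
# -> [{"NPU": 1}, {"NPU": 1}], grouped under the configured placement strategy.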

python/ray/util/accelerators/accelerators.py (+2)

@@ -26,6 +26,8 @@
 GOOGLE_TPU_V5P = "TPU-V5P"
 GOOGLE_TPU_V5LITEPOD = "TPU-V5LITEPOD"
 GOOGLE_TPU_V6E = "TPU-V6E"
+HUAWEI_NPU_910B = "Ascend910B"
+HUAWEI_NPU_910B4 = "Ascend910B4"

 # Use these instead of NVIDIA_A100 if you need a specific accelerator size. Note that
 # these labels are not auto-added to nodes, you'll have to add them manually in
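
Once these constants are available, they can be combined with a matching custom resource when targeting Ascend nodes. A hedged sketch that mirrors how the existing GPU/TPU constants in this module are typically used; the "NPU" resource name is an assumption about how the cluster's nodes are configured.

# Hypothetical usage sketch, not part of this commit.
import ray
from ray.util.accelerators import HUAWEI_NPU_910B

@ray.remote(resources={"NPU": 1}, accelerator_type=HUAWEI_NPU_910B)
def run_on_ascend():
    # Scheduled only on nodes labeled with the Ascend910B accelerator type.
    return "scheduled on an Ascend 910B node"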
