
Commit 0e224fa

Support LLM on custom resources beyond GPU.

Signed-off-by: liuxsh9 <liuxiaoshuang4@huawei.com>

committed by liuxsh9 · 1 parent 31878c9 · commit 0e224fa

File tree: 6 files changed, +42 / -12 lines

  python/ray/llm/_internal/batch/processor/base.py
  python/ray/llm/_internal/batch/processor/vllm_engine_proc.py
  python/ray/llm/_internal/batch/stages/vllm_engine_stage.py
  python/ray/llm/_internal/serve/configs/server_models.py
  python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py
  python/ray/util/accelerators/accelerators.py

python/ray/llm/_internal/batch/processor/base.py (+5)

@@ -26,6 +26,11 @@ class ProcessorConfig(BaseModelExtended):
         "You can tune the batch size to balance the throughput and fault-tolerance "
         "based on your use case. Defaults to 64.",
     )
+    resources_per_worker: Optional[Dict[str, float]] = Field(
+        default=None,
+        description="This overrides the default resource config for actors/workers; "
+        "the default resource config for the LLM stage is typically {'GPU': 1}.",
+    )
     accelerator_type: Optional[str] = Field(
         default=None,
         description="The accelerator type used by the LLM stage in a processor. "

python/ray/llm/_internal/batch/processor/vllm_engine_proc.py (+1)

@@ -167,6 +167,7 @@ def build_vllm_engine_processor(
                # This is used to make sure we overlap batches to avoid the tail
                # latency of each batch.
                max_concurrency=config.max_concurrent_batches,
+                resources=config.resources_per_worker,
                accelerator_type=config.accelerator_type,
                runtime_env=config.runtime_env,
            ),
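
The `resources` entry added here is what the stage's `post_init` hook (next file) later reads back via `map_batches_kwargs.get("resources")`. A compressed sketch of that hand-off, with assumed values:

# Illustrative only: how the processor-level config reaches the stage kwargs.
config_resources = {"NPU": 1}          # stands in for config.resources_per_worker
map_batches_kwargs = dict(
    max_concurrency=4,                 # stands in for config.max_concurrent_batches
    resources=config_resources,        # the line added in this hunk
)
assert map_batches_kwargs.get("resources") == {"NPU": 1}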

python/ray/llm/_internal/batch/stages/vllm_engine_stage.py (+21, -11)

@@ -521,26 +521,28 @@ def __del__(self):
         self.llm.shutdown()


-def _ray_scheduling_strategy_fn(num_gpus_per_instance: int, accelerator_type: str):
+def _ray_scheduling_strategy_fn(
+    num_workers_per_instance: int, accelerator_type: str, resources: Optional[Dict[str, float]] = None
+):
     """
     Create a Ray scheduling strategy for vLLM engine.

     Args:
-        num_gpus_per_instance: The number of GPUs per instance.
+        num_workers_per_instance: The number of workers per instance.
         accelerator_type: The accelerator type.

     Returns:
         The Ray scheduling strategy.
     """

     def _get_bundle() -> Dict[str, float]:
-        bundle: Dict[str, float] = {"GPU": 1, "CPU": 1}
+        bundle: Dict[str, float] = resources if resources else {"GPU": 1, "CPU": 1}
         if accelerator_type:
             bundle[f"accelerator_type:{accelerator_type}"] = 0.001
         return bundle

     pg = ray.util.placement_group(
-        [_get_bundle()] * num_gpus_per_instance,
+        [_get_bundle()] * num_workers_per_instance,
         strategy="STRICT_PACK",
     )
     return dict(

@@ -569,6 +571,7 @@ def post_init(cls, values):
         The updated values.
         """
         map_batches_kwargs = values["map_batches_kwargs"]
+        resources_per_worker = map_batches_kwargs.get("resources")
         accelerator_type = map_batches_kwargs.get("accelerator_type", "")
         fn_constructor_kwargs = values["fn_constructor_kwargs"]
         engine_kwargs = fn_constructor_kwargs.get("engine_kwargs", {})

@@ -577,29 +580,36 @@ def post_init(cls, values):
         if accelerator_type:
             ray_remote_args["accelerator_type"] = accelerator_type

-        # Setup num_gpus required per vLLM engine.
+        # Setup num_workers required per vLLM engine.
         tp_size = engine_kwargs.get("tensor_parallel_size", 1)
         pp_size = engine_kwargs.get("pipeline_parallel_size", 1)
-        num_gpus = tp_size * pp_size
+        num_workers = tp_size * pp_size

         # Use the MP backend by default.
         engine_kwargs.setdefault("distributed_executor_backend", "mp")
         executor_backend = engine_kwargs.get("distributed_executor_backend")

-        # When Ray is used in the vLLM engine, we set num_gpus to 0 so that
+        # When Ray is used in the vLLM engine, we set num_devices to 0 so that
         # Ray Data won't reserve GPUs in advance. Instead, we specify scheduling
         # strategy in .map_batches() arguments and let vLLM Ray executor to
         # create placement groups for each TP/PP worker.
-        if executor_backend == "ray" and num_gpus > 1:
+        num_mp_workers = num_workers
+        if executor_backend == "ray" and num_workers > 1:
             # Note that we have to use partial() to pass a function
             # instead of an object.
             map_batches_kwargs["ray_remote_args_fn"] = partial(
                 _ray_scheduling_strategy_fn,
-                num_gpus,
+                num_workers,
                 accelerator_type,
             )
-            num_gpus = 0
+            num_mp_workers = 0
+
+        if not resources_per_worker:
+            map_batches_kwargs["num_gpus"] = num_mp_workers
+        else:
+            ray_remote_args["resources"] = {
+                key: value * num_mp_workers for key, value in resources_per_worker.items()
+            }

-        map_batches_kwargs["num_gpus"] = num_gpus
         map_batches_kwargs.update(ray_remote_args)
         return values
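
To make the new branching concrete, a small worked sketch of the values involved; all inputs are assumed for illustration and the "NPU" resource name is arbitrary.

# Assumed inputs: tensor_parallel_size=2, pipeline_parallel_size=1,
# resources_per_worker={"NPU": 1}, accelerator_type="Ascend910B".
num_workers = 2 * 1                     # tp_size * pp_size

# MP backend (the default): the whole engine lives in one Ray Data actor, so
# post_init multiplies the per-worker resources by the worker count instead of
# setting num_gpus on the actor:
ray_remote_args = {"resources": {"NPU": 1 * num_workers}}   # -> {"NPU": 2}

# Ray backend with num_workers > 1: if the custom resources are supplied to
# _ray_scheduling_strategy_fn, _get_bundle() builds one bundle per TP/PP
# worker and the bundles are packed with STRICT_PACK:
bundle = {"NPU": 1}                     # instead of {"GPU": 1, "CPU": 1}
bundle["accelerator_type:Ascend910B"] = 0.001
bundles = [bundle] * num_workers        # -> two identical bundles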

python/ray/llm/_internal/serve/configs/server_models.py (+6)

@@ -187,6 +187,12 @@ class LLMConfig(BaseModelExtended):
         ),
     )

+    resources_per_worker: Optional[Dict[str, float]] = Field(
+        default=None,
+        description="This is passed down to configs like `VLLMEngineConfig` and overrides "
+        "the resource config for the workers in the vLLM engine.",
+    )
+
     accelerator_type: Optional[str] = Field(
         default=None,
         description=f"The type of accelerator runs the model on. Only the following values are supported: {str([t.value for t in GPUType])}",

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py (+7, -1)

@@ -51,6 +51,11 @@ class VLLMEngineConfig(BaseModelExtended):
         None,
         description="Configuration for cloud storage mirror. This is for where the weights are downloaded from.",
     )
+    resources_per_worker: Optional[Dict[str, float]] = Field(
+        default=None,
+        description="This overrides the vLLM engine worker's default resource configuration, "
+        "i.e. the resources returned by `placement_bundles`.",
+    )
     accelerator_type: Optional[GPUType] = Field(
         None,
         description="The type of accelerator to use. This is used to determine the placement group strategy.",

@@ -104,6 +109,7 @@ def from_llm_config(cls, llm_config: LLMConfig) -> "VLLMEngineConfig":
             model_id=llm_config.model_id,
             hf_model_id=hf_model_id,
             mirror_config=mirror_config,
+            accelerator_name=llm_config.accelerator_name,
             accelerator_type=llm_config.accelerator_type,
             engine_kwargs=llm_config.engine_kwargs,
             runtime_env=llm_config.runtime_env,

@@ -134,7 +140,7 @@ def placement_strategy(self) -> str:

     @property
     def placement_bundles(self) -> List[Dict[str, float]]:
-        bundle = {"GPU": 1}
+        bundle = {self.accelerator_name: 1}
         if self.accelerator_type:
             bundle[self.ray_accelerator_type()] = 0.001
         bundles = [bundle for _ in range(self.num_gpu_workers)]
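
To illustrate the `placement_bundles` change, a sketch of the expected output under assumed values: suppose `accelerator_name` resolves to "NPU" (how it is populated is outside this diff) and there are two workers (`tensor_parallel_size=2`).

# Illustrative only: what placement_bundles would now return.
accelerator_name = "NPU"                # assumed; previously hard-coded as "GPU"
num_gpu_workers = 2

bundle = {accelerator_name: 1}
bundles = [bundle for _ in range(num_gpu_workers)]
# -> [{"NPU": 1}, {"NPU": 1}], grouped under the configured placement strategy.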

python/ray/util/accelerators/accelerators.py (+2)

@@ -26,6 +26,8 @@
 GOOGLE_TPU_V5P = "TPU-V5P"
 GOOGLE_TPU_V5LITEPOD = "TPU-V5LITEPOD"
 GOOGLE_TPU_V6E = "TPU-V6E"
+HUAWEI_NPU_910B = "Ascend910B"
+HUAWEI_NPU_910B4 = "Ascend910B4"

 # Use these instead of NVIDIA_A100 if you need a specific accelerator size. Note that
 # these labels are not auto-added to nodes, you'll have to add them manually in
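
Once these constants are available, they can be combined with a matching custom resource when targeting Ascend nodes. A hedged sketch that mirrors how the existing GPU/TPU constants in this module are typically used; the "NPU" resource name is an assumption about how the cluster's nodes are configured.

# Hypothetical usage sketch, not part of this commit.
import ray
from ray.util.accelerators import HUAWEI_NPU_910B

@ray.remote(resources={"NPU": 1}, accelerator_type=HUAWEI_NPU_910B)
def run_on_ascend():
    # Scheduled only on nodes labeled with the Ascend910B accelerator type.
    return "scheduled on an Ascend 910B node"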
