
Commit f3afcba

[ray.llm] Refactor model download utilities (#51604)

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>

1 parent ffe5a09 · commit f3afcba

File tree

10 files changed: +278 / -181 lines changed

python/ray/llm/_internal/batch/processor/vllm_engine_proc.py

Lines changed: 10 additions & 2 deletions
@@ -11,7 +11,6 @@
     ProcessorConfig,
     ProcessorBuilder,
 )
-from ray.llm._internal.batch.utils import download_hf_model
 from ray.llm._internal.batch.stages import (
     vLLMEngineStage,
     ChatTemplateStage,
@@ -25,6 +24,10 @@
     BatchModelTelemetry,
 )
 from ray.llm._internal.common.observability.telemetry_utils import DEFAULT_GPU_TYPE
+from ray.llm._internal.common.utils.download_utils import (
+    download_model_files,
+    NodeModelDownloadable,
+)
 from ray.llm._internal.batch.observability.usage_telemetry.usage import (
     get_or_create_telemetry_agent,
 )
@@ -215,7 +218,12 @@ def build_vllm_engine_processor(
         )
     )
 
-    model_path = download_hf_model(config.model_source, tokenizer_only=True)
+    model_path = download_model_files(
+        model_id=config.model_source,
+        mirror_config=None,
+        download_model=NodeModelDownloadable.TOKENIZER_ONLY,
+        download_extra_files=False,
+    )
     hf_config = transformers.AutoConfig.from_pretrained(model_path)
     architecture = getattr(hf_config, "architectures", [DEFAULT_MODEL_ARCHITECTURE])[0]
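Note: the same call-site rewrite repeats in chat_template_stage.py, tokenize_stage.py, and vllm_engine_stage.py below, so here is the pattern once, as a sketch. It assumes only what the hunks show: download_model_files takes model_id, mirror_config, download_model, and download_extra_files, and (like the removed download_hf_model) returns a local path. Passing mirror_config=None at every migrated call site presumably preserves the old behavior of inferring a cloud mirror from a remote model id.

    from ray.llm._internal.common.utils.download_utils import (
        download_model_files,
        NodeModelDownloadable,
    )

    # Before (removed in this commit):
    #     model_path = download_hf_model(model_source, tokenizer_only=True)
    # After: the boolean flag becomes an explicit enum value.
    model_path = download_model_files(
        model_id="facebook/opt-125m",  # hypothetical model id, for illustration only
        mirror_config=None,
        download_model=NodeModelDownloadable.TOKENIZER_ONLY,
        download_extra_files=False,
    )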
python/ray/llm/_internal/batch/stages/chat_template_stage.py

Lines changed: 10 additions & 2 deletions
@@ -6,7 +6,10 @@
     StatefulStage,
     StatefulStageUDF,
 )
-from ray.llm._internal.batch.utils import download_hf_model
+from ray.llm._internal.common.utils.download_utils import (
+    download_model_files,
+    NodeModelDownloadable,
+)
 
 
 class ChatTemplateUDF(StatefulStageUDF):
@@ -36,7 +39,12 @@ def __init__(
         # because tokenizers of VLM models may not have chat template attribute.
         # However, this may not be a reliable solution, because processors and
         # tokenizers are not standardized across different models.
-        model_path = download_hf_model(model, tokenizer_only=True)
+        model_path = download_model_files(
+            model_id=model,
+            mirror_config=None,
+            download_model=NodeModelDownloadable.TOKENIZER_ONLY,
+            download_extra_files=False,
+        )
         self.processor = AutoProcessor.from_pretrained(
             model_path, trust_remote_code=True
         )

python/ray/llm/_internal/batch/stages/tokenize_stage.py

Lines changed: 16 additions & 5 deletions
@@ -6,10 +6,11 @@
     StatefulStage,
     StatefulStageUDF,
 )
-from ray.llm._internal.batch.utils import (
-    get_cached_tokenizer,
-    download_hf_model,
+from ray.llm._internal.common.utils.download_utils import (
+    download_model_files,
+    NodeModelDownloadable,
 )
+from ray.llm._internal.batch.utils import get_cached_tokenizer
 
 
 class TokenizeUDF(StatefulStageUDF):
@@ -30,7 +31,12 @@ def __init__(
         from transformers import AutoTokenizer
 
         super().__init__(data_column, expected_input_keys)
-        model_path = download_hf_model(model, tokenizer_only=True)
+        model_path = download_model_files(
+            model_id=model,
+            mirror_config=None,
+            download_model=NodeModelDownloadable.TOKENIZER_ONLY,
+            download_extra_files=False,
+        )
         self.tokenizer = get_cached_tokenizer(
             AutoTokenizer.from_pretrained(
                 model_path,
@@ -88,7 +94,12 @@ def __init__(
         from transformers import AutoTokenizer
 
         super().__init__(data_column, expected_input_keys)
-        model_path = download_hf_model(model, tokenizer_only=True)
+        model_path = download_model_files(
+            model_id=model,
+            mirror_config=None,
+            download_model=NodeModelDownloadable.TOKENIZER_ONLY,
+            download_extra_files=False,
+        )
         self.tokenizer = get_cached_tokenizer(
             AutoTokenizer.from_pretrained(
                 model_path,

python/ray/llm/_internal/batch/stages/vllm_engine_stage.py

Lines changed: 10 additions & 4 deletions
@@ -17,11 +17,12 @@
     StatefulStage,
     StatefulStageUDF,
 )
-from ray.llm._internal.batch.utils import (
+from ray.llm._internal.common.utils.cloud_utils import is_remote_path
+from ray.llm._internal.common.utils.download_utils import (
     download_lora_adapter,
-    download_hf_model,
+    download_model_files,
+    NodeModelDownloadable,
 )
-from ray.llm._internal.common.utils.cloud_utils import is_remote_path
 from ray.llm._internal.utils import try_import
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
@@ -476,7 +477,12 @@ def __init__(
         logger.info("Max pending requests is set to %d", self.max_pending_requests)
 
         # Download the model if needed.
-        model_source = download_hf_model(self.model, tokenizer_only=False)
+        model_source = download_model_files(
+            model_id=self.model,
+            mirror_config=None,
+            download_model=NodeModelDownloadable.MODEL_AND_TOKENIZER,
+            download_extra_files=False,
+        )
 
         # Create an LLM engine.
         self.llm = vLLMEngineWrapper(
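This hunk is the one place the full model is fetched rather than just the tokenizer. Only two NodeModelDownloadable members appear anywhere in this commit; below is a minimal sketch of the shape the enum plausibly has. The real definition in download_utils may differ (extra members, different values), and the string values here are placeholders:

    from enum import Enum

    class NodeModelDownloadable(Enum):
        # Grounded in the diff: these two members are used at the call sites.
        MODEL_AND_TOKENIZER = "model_and_tokenizer"
        TOKENIZER_ONLY = "tokenizer_only"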
python/ray/llm/_internal/batch/utils.py

Lines changed: 1 addition & 56 deletions
@@ -1,13 +1,6 @@
 """Utility functions for batch processing."""
 import logging
-import os
-from typing import TYPE_CHECKING, Any, Optional, Union
-
-from ray.llm._internal.common.utils.cloud_utils import (
-    CloudMirrorConfig,
-    is_remote_path,
-)
-from ray.llm._internal.common.utils.download_utils import CloudModelDownloader
+from typing import TYPE_CHECKING, Any, Union
 
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -67,51 +60,3 @@ def __len__(self):
 
     tokenizer.__class__ = CachedTokenizer
     return tokenizer
-
-
-def download_hf_model(model_source: str, tokenizer_only: bool = True) -> str:
-    """Download the HF model from the model source.
-
-    Args:
-        model_source: The model source path.
-        tokenizer_only: Whether to download only the tokenizer.
-
-    Returns:
-        The local path to the downloaded model.
-    """
-
-    bucket_uri = None
-    if is_remote_path(model_source):
-        bucket_uri = model_source
-
-    mirror_config = CloudMirrorConfig(bucket_uri=bucket_uri)
-    downloader = CloudModelDownloader(model_source, mirror_config)
-    return downloader.get_model(tokenizer_only=tokenizer_only)
-
-
-def download_lora_adapter(
-    lora_name: str,
-    remote_path: Optional[str] = None,
-) -> str:
-    """If remote_path is specified, pull the lora to the local
-    directory and return the local path.
-
-    Args:
-        lora_name: The lora name.
-        remote_path: The remote path to the lora. If specified, the remote_path will be
-            used as the base path to load the lora.
-
-    Returns:
-        The local path to the lora if remote_path is specified, otherwise the lora name.
-    """
-    assert not is_remote_path(
-        lora_name
-    ), "lora_name cannot be a remote path (s3:// or gs://)"
-
-    if remote_path is None:
-        return lora_name
-
-    lora_path = os.path.join(remote_path, lora_name)
-    mirror_config = CloudMirrorConfig(bucket_uri=lora_path)
-    downloader = CloudModelDownloader(lora_name, mirror_config)
-    return downloader.get_model(tokenizer_only=False)
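The deleted bodies above, together with the call-site rewrites earlier in this commit, pin down how the old API maps onto the new one. A minimal compatibility shim, assuming download_model_files keeps the semantics those call sites rely on:

    from ray.llm._internal.common.utils.download_utils import (
        download_model_files,
        NodeModelDownloadable,
    )

    def download_hf_model(model_source: str, tokenizer_only: bool = True) -> str:
        """Drop-in stand-in for the deleted helper, built on the new API."""
        return download_model_files(
            model_id=model_source,
            # None at every migrated call site; presumably reproduces the old
            # infer-the-mirror-from-a-remote-model-id behavior.
            mirror_config=None,
            download_model=(
                NodeModelDownloadable.TOKENIZER_ONLY
                if tokenizer_only
                else NodeModelDownloadable.MODEL_AND_TOKENIZER
            ),
            download_extra_files=False,
        )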
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+import logging
+from typing import Optional
+
+from ray._private.ray_logging.filters import CoreContextFilter
+
+
+def _setup_logger(logger_name: str):
+    """Setup logger given the logger name.
+
+    This function is idempotent and won't set up the same logger multiple times. It will
+    also skip the setup if logger is already setup and has handlers.
+
+    Args:
+        logger_name: logger name used to get the logger.
+    """
+    logger = logging.getLogger(logger_name)
+    llm_logger = logging.getLogger("ray.llm")
+
+    # Skip setup if the logger already has handlers setup or if the parent (Data
+    # logger) has handlers.
+    if logger.handlers or llm_logger.handlers:
+        return
+
+    # Set up stream handler, which logs to console as plaintext.
+    stream_handler = logging.StreamHandler()
+    stream_handler.addFilter(CoreContextFilter())
+    logger.addHandler(stream_handler)
+    logger.setLevel(logging.INFO)
+    logger.propagate = False
+
+
+def get_logger(name: Optional[str] = None):
+    """Get a structured logger inherited from the Ray Data logger.
+
+    Loggers by default are logging to stdout, and are expected to be scraped by an
+    external process.
+    """
+    logger_name = f"ray.llm.{name}"
+    _setup_logger(logger_name)
+    return logging.getLogger(logger_name)
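This new module's behavior is fully visible above, and its import path is confirmed by the cloud_utils.py hunk at the end of this commit. Typical use:

    from ray.llm._internal.common.observability.logging import get_logger

    logger = get_logger(__name__)  # yields the "ray.llm.<module>" logger
    logger.info("model files downloaded")  # plaintext record with Core context attached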
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+import logging
+
+from ray._private.ray_logging.filters import CoreContextFilter
+from ray._private.ray_logging.formatters import JSONFormatter
+
+
+def _configure_stdlib_logging():
+    """Configures stdlib root logger to make sure stdlib loggers (created as
+    `logging.getLogger(...)`) are using Ray's `JSONFormatter` with Core and Serve
+    context filters.
+    """
+
+    handler = logging.StreamHandler()
+    handler.addFilter(CoreContextFilter())
+    handler.setFormatter(JSONFormatter())
+
+    root_logger = logging.getLogger()
+    # NOTE: It's crucial we reset all the handlers of the root logger,
+    # to make sure that logs aren't emitted twice
+    root_logger.handlers = []
+    root_logger.addHandler(handler)
+    root_logger.setLevel(logging.INFO)
+
+
+def setup_logging():
+    _configure_stdlib_logging()
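A sketch of intended use for this second new module: call setup_logging() once at process start so every stdlib logger emits Ray's JSON format exactly once (the handler reset guards against double emission). The file's path is not shown in this capture, so the import below is an assumption:

    # NOTE: import path assumed; this file's location is not visible in the diff header.
    from ray.llm._internal.common.observability.logging.setup import setup_logging
    import logging

    setup_logging()
    logging.getLogger("my.module").info("hello")  # one JSON record via the root handler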

python/ray/llm/_internal/common/utils/cloud_utils.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@
 # Use pyarrow for cloud storage access
 import pyarrow.fs as pa_fs
 
-from ray.llm._internal.serve.observability.logging import get_logger
+from ray.llm._internal.common.observability.logging import get_logger
 from ray.llm._internal.common.base_pydantic import BaseModelExtended
