
Commit 3d30db2

[ray.data.llm] Support S3 paths for model checkpoint and LoRA path (#51103)
## Why are these changes needed?

- Pre-download model checkpoints from a remote path (e.g. S3).
- Update the documentation on how to use the RunAI streamer in vLLM to load model checkpoints from a remote path.
- Support remote paths for LoRA adapters.

## Checks

- [x] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR.
- [x] I've run `scripts/format.sh` to lint the changes in this PR.
- [x] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [x] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [x] I've made sure the tests are passing. Note that there might be a few flaky tests; see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
  - [ ] Unit tests
  - [ ] Release tests
  - [ ] This PR is not tested :(

---------

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
1 parent 6268a2d commit 3d30db2

File tree

16 files changed: +304, -41 lines

.vale/styles/config/vocabularies/Data/accept.txt

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ Predibase('s)?
 [Pp]reprocess
 [Pp]reprocessor(s)?
 [Pp]ushdown
+runai
 [Ss]calers
 Spotify('s)?
 TFRecord(s)?

doc/source/data/working-with-llms.rst

Lines changed: 19 additions & 0 deletions
@@ -88,6 +88,25 @@ Some models may require a Hugging Face token to be specified. You can specify th
         batch_size=64,
     )
 
+If your model is hosted on AWS S3, you can specify the S3 path in the `model_source` argument, and specify `load_format="runai_streamer"` in the `engine_kwargs` argument.
+
+.. note::
+    Install vLLM with runai dependencies: `pip install -U "vllm[runai]==0.7.2"`
+
+.. testcode::
+
+    config = vLLMEngineProcessorConfig(
+        model_source="s3://your-bucket/your-model/",  # Make sure to add the trailing slash!
+        engine_kwargs={"load_format": "runai_streamer"},
+        runtime_env={"env_vars": {
+            "AWS_ACCESS_KEY_ID": "your_access_key_id",
+            "AWS_SECRET_ACCESS_KEY": "your_secret_access_key",
+            "AWS_REGION": "your_region",
+        }},
+        concurrency=1,
+        batch_size=64,
+    )
+
 .. _vllm_llm:
 
 Configure vLLM for LLM inference
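
The doc snippet above only builds the config. A minimal end-to-end sketch of how such an S3-backed config is typically used with `ray.data.llm` follows; the bucket name, dataset column, and the exact preprocess/postprocess fields are illustrative assumptions taken from the working-with-llms guide, not part of this commit:

```python
import ray
from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig

# Hypothetical S3 bucket; requires AWS credentials in the runtime env and
# vLLM installed with the runai extra.
config = vLLMEngineProcessorConfig(
    model_source="s3://your-bucket/your-model/",  # trailing slash required
    engine_kwargs={"load_format": "runai_streamer"},
    concurrency=1,
    batch_size=64,
)

processor = build_llm_processor(
    config,
    preprocess=lambda row: dict(
        messages=[{"role": "user", "content": row["question"]}],
        sampling_params=dict(temperature=0.0, max_tokens=128),
    ),
    postprocess=lambda row: dict(answer=row["generated_text"]),
)

ds = ray.data.from_items([{"question": "What is Ray Data?"}])
print(processor(ds).take_all())
```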

doc/source/llm/examples/batch/vllm-with-lora.ipynb

Lines changed: 5 additions & 0 deletions
@@ -55,6 +55,8 @@
     "    batch_size=16,\n",
     "    # Use one GPU in this example.\n",
     "    concurrency=1,\n",
+    "    # If you save the LoRA adapter in S3, you can set the following path.\n",
+    "    # dynamic_lora_loading_path=\"s3://your-lora-bucket/\",\n",
     ")\n",
     "\n",
     "# 2. Construct a processor using the processor config.\n",
@@ -66,6 +68,9 @@
     "        # from the model you specify in the processor config, then this\n",
     "        # is the LoRA adapter. The \"model\" here can be a LoRA adapter\n",
     "        # available in the HuggingFace Hub or a local path.\n",
+    "        #\n",
+    "        # If you set dynamic_lora_loading_path, then only specify the LoRA\n",
+    "        # path under dynamic_lora_loading_path.\n",
     "        model=\"EdBergJr/Llama32_Baha_3\",\n",
     "        messages=[\n",
     "            {\"role\": \"system\",\n",

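A hedged sketch of how these two notebook changes fit together: once `dynamic_lora_loading_path` is set on the config, the per-row `model` value is just the adapter's subfolder name under that base path rather than a Hub ID or full URI. The bucket, adapter name, and the LoRA-related `engine_kwargs` below are placeholders, not values from this commit:

```python
from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig

# Assumes the adapter files live at s3://your-lora-bucket/my_adapter/.
config = vLLMEngineProcessorConfig(
    model_source="meta-llama/Llama-3.2-1B-Instruct",
    engine_kwargs={"enable_lora": True, "max_lora_rank": 32},
    dynamic_lora_loading_path="s3://your-lora-bucket/",
    batch_size=16,
    concurrency=1,
)

processor = build_llm_processor(
    config,
    preprocess=lambda row: dict(
        model="my_adapter",  # subfolder under dynamic_lora_loading_path, not "s3://..."
        messages=[{"role": "user", "content": row["prompt"]}],
        sampling_params=dict(max_tokens=64),
    ),
)
```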
python/ray/llm/_internal/batch/processor/vllm_engine_proc.py

Lines changed: 10 additions & 0 deletions
@@ -92,6 +92,15 @@ class vLLMEngineProcessorConfig(ProcessorConfig):
         description="Whether the input messages have images.",
     )
 
+    # LoRA configurations.
+    dynamic_lora_loading_path: Optional[str] = Field(
+        default=None,
+        description="The path to the dynamic LoRA adapter. It is expected "
+        "to hold subfolders each for a different lora checkpoint. If not "
+        "specified and LoRA is enabled, then the 'model' in LoRA "
+        "requests will be interpreted as model ID used by HF transformers.",
+    )
+
     @root_validator(pre=True)
     def validate_task_type(cls, values):
         task_type_str = values.get("task_type", "generate")
@@ -169,6 +178,7 @@ def build_vllm_engine_processor(
             engine_kwargs=config.engine_kwargs,
             task_type=config.task_type,
             max_pending_requests=config.max_pending_requests,
+            dynamic_lora_loading_path=config.dynamic_lora_loading_path,
         ),
         map_batches_kwargs=dict(
             zero_copy_batch=True,
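
The field description above implies a specific layout for the remote path: one subfolder per adapter, keyed by the name that later appears as the row-level `model`. A hypothetical example layout (bucket, adapter names, and adapter file names are made up for illustration):

```
s3://your-lora-bucket/              <- dynamic_lora_loading_path
├── adapter_a/                      <- referenced as model="adapter_a"
│   ├── adapter_config.json
│   └── adapter_model.safetensors
└── adapter_b/                      <- referenced as model="adapter_b"
    ├── adapter_config.json
    └── adapter_model.safetensors
```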

python/ray/llm/_internal/batch/stages/chat_template_stage.py

Lines changed: 5 additions & 1 deletion
@@ -6,6 +6,7 @@
     StatefulStage,
     StatefulStageUDF,
 )
+from ray.llm._internal.batch.utils import download_hf_model
 
 
 class ChatTemplateUDF(StatefulStageUDF):
@@ -33,7 +34,10 @@ def __init__(
         # because tokenizers of VLM models may not have chat template attribute.
         # However, this may not be a reliable solution, because processors and
         # tokenizers are not standardized across different models.
-        self.processor = AutoProcessor.from_pretrained(model, trust_remote_code=True)
+        model_path = download_hf_model(model, tokenizer_only=True)
+        self.processor = AutoProcessor.from_pretrained(
+            model_path, trust_remote_code=True
+        )
         self.chat_template = chat_template
 
     async def udf(self, batch: List[Dict[str, Any]]) -> AsyncIterator[Dict[str, Any]]:

python/ray/llm/_internal/batch/stages/tokenize_stage.py

Lines changed: 8 additions & 3 deletions
@@ -6,7 +6,10 @@
     StatefulStage,
     StatefulStageUDF,
 )
-from ray.llm._internal.batch.utils import get_cached_tokenizer
+from ray.llm._internal.batch.utils import (
+    get_cached_tokenizer,
+    download_hf_model,
+)
 
 
 class TokenizeUDF(StatefulStageUDF):
@@ -25,9 +28,10 @@ def __init__(
         from transformers import AutoTokenizer
 
         super().__init__(data_column)
+        model_path = download_hf_model(model, tokenizer_only=True)
         self.tokenizer = get_cached_tokenizer(
             AutoTokenizer.from_pretrained(
-                model,
+                model_path,
                 trust_remote_code=True,
             )
         )
@@ -81,9 +85,10 @@ def __init__(
         from transformers import AutoTokenizer
 
         super().__init__(data_column)
+        model_path = download_hf_model(model, tokenizer_only=True)
         self.tokenizer = get_cached_tokenizer(
             AutoTokenizer.from_pretrained(
-                model,
+                model_path,
                 trust_remote_code=True,
             )
         )

python/ray/llm/_internal/batch/stages/vllm_engine_stage.py

Lines changed: 72 additions & 20 deletions
@@ -17,6 +17,11 @@
     StatefulStage,
     StatefulStageUDF,
 )
+from ray.llm._internal.batch.utils import (
+    download_lora_adapter,
+    download_hf_model,
+)
+from ray.llm._internal.common.utils.cloud_utils import is_remote_path
 from ray.llm._internal.utils import try_import
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
@@ -119,22 +124,30 @@ class vLLMEngineWrapper:
     Args:
         *args: The positional arguments for the engine.
         max_pending_requests: The maximum number of pending requests in the queue.
+        dynamic_lora_loading_path: The S3 path to the dynamic LoRA adapter.
         **kwargs: The keyword arguments for the engine.
     """
 
     def __init__(
         self,
         idx_in_batch_column: str,
         max_pending_requests: int = -1,
+        dynamic_lora_loading_path: Optional[str] = None,
         **kwargs,
     ):
         self.request_id = 0
         self.idx_in_batch_column = idx_in_batch_column
         self.task_type = kwargs.get("task", vLLMTaskType.GENERATE)
-        self.model = kwargs.get("model", None)
-        assert self.model is not None
+
+        # Use model_source in kwargs["model"] because "model" is actually
+        # the model source in vLLM.
+        self.model = kwargs.pop("model", None)
+        self.model_source = kwargs.pop("model_source", None)
+        assert self.model is not None and self.model_source is not None
+        kwargs["model"] = self.model_source
 
         # LoRA related.
+        self.dynamic_lora_loading_path = dynamic_lora_loading_path
         self.lora_lock = asyncio.Lock()
         self.lora_name_to_request = {}
 
@@ -196,47 +209,77 @@ def _maybe_convert_ndarray_to_list(self, params: Dict[str, Any]) -> Dict[str, An
             return params.tolist()
         return params
 
-    async def _prepare_llm_request(self, row: Dict[str, Any]) -> vLLMEngineRequest:
-        """Prepare the inputs for LLM inference.
+    async def _maybe_get_lora_request(
+        self,
+        row: Dict[str, Any],
+    ) -> Optional[Any]:
+        """Get the LoRA request for the given row.
+        Specifically, if the model name is given and is different from the model
+        set in the config, then this request has LoRA.
 
         Args:
             row: The row.
 
         Returns:
-            A single vLLMEngineRequest.
+            The LoRA request (vllm.lora.request.LoRARequest),
+            or None if there is no LoRA. We use Any in type hint to
+            pass doc build in the environment without vLLM.
         """
-        prompt = row.pop("prompt")
-
-        if "tokenized_prompt" in row:
-            tokenized_prompt = row.pop("tokenized_prompt").tolist()
-        else:
-            tokenized_prompt = None
-
-        if "image" in row:
-            image = row.pop("image")
-        else:
-            image = []
-
-        # If the model name is given and is different from the model
-        # set in the config, then this is a LoRA.
         lora_request = None
         if "model" in row and row["model"] != self.model:
             if self.vllm_use_v1:
                 raise ValueError("LoRA is only supported with vLLM v0")
 
             lora_name = row["model"]
            if lora_name not in self.lora_name_to_request:
+                if is_remote_path(lora_name):
+                    raise ValueError(
+                        "LoRA name cannot be a remote path (s3:// or gs://). "
+                        "Please specify dynamic_lora_loading_path in the processor config."
+                    )
+
                 async with self.lora_lock:
                     if lora_name not in self.lora_name_to_request:
                         # Load a new LoRA adapter if it is not loaded yet.
+                        lora_path = download_lora_adapter(
+                            lora_name,
+                            remote_path=self.dynamic_lora_loading_path,
+                        )
+                        logger.info(
+                            "Downloaded LoRA adapter for %s to %s", lora_name, lora_path
+                        )
                         lora_request = vllm.lora.request.LoRARequest(
                             lora_name=lora_name,
                             # LoRA ID starts from 1.
                             lora_int_id=len(self.lora_name_to_request) + 1,
-                            lora_path=lora_name,
+                            lora_path=lora_path,
                         )
                         self.lora_name_to_request[lora_name] = lora_request
            lora_request = self.lora_name_to_request[lora_name]
+        return lora_request
+
+    async def _prepare_llm_request(self, row: Dict[str, Any]) -> vLLMEngineRequest:
+        """Prepare the inputs for LLM inference.
+
+        Args:
+            row: The row.
+
+        Returns:
+            A single vLLMEngineRequest.
+        """
+        prompt = row.pop("prompt")
+
+        if "tokenized_prompt" in row:
+            tokenized_prompt = row.pop("tokenized_prompt").tolist()
+        else:
+            tokenized_prompt = None
+
+        if "image" in row:
+            image = row.pop("image")
+        else:
+            image = []
+
+        lora_request = await self._maybe_get_lora_request(row)
 
         # Prepare sampling parameters.
         if self.task_type == vLLMTaskType.GENERATE:
@@ -396,6 +439,7 @@ def __init__(
         engine_kwargs: Dict[str, Any],
         task_type: vLLMTaskType = vLLMTaskType.GENERATE,
         max_pending_requests: Optional[int] = None,
+        dynamic_lora_loading_path: Optional[str] = None,
     ):
         """
         Initialize the vLLMEngineStageUDF.
@@ -407,6 +451,8 @@ def __init__(
             task_type: The task to use for the vLLM engine (e.g., "generate", "embed", etc).
             max_pending_requests: The maximum number of pending requests. If None,
                 it will be set to 1.1 * max_num_seqs * pipeline_parallel_size.
+            dynamic_lora_loading_path: The path to the dynamic LoRA adapter. It is expected
+                to hold subfolders each for a different lora checkpoint.
         """
         super().__init__(data_column)
         self.model = model
@@ -423,12 +469,17 @@ def __init__(
         if self.max_pending_requests > 0:
             logger.info("Max pending requests is set to %d", self.max_pending_requests)
 
+        # Download the model if needed.
+        model_source = download_hf_model(self.model, tokenizer_only=False)
+
         # Create an LLM engine.
         self.llm = vLLMEngineWrapper(
             model=self.model,
+            model_source=model_source,
             idx_in_batch_column=self.IDX_IN_BATCH_COLUMN,
             disable_log_stats=False,
             max_pending_requests=self.max_pending_requests,
+            dynamic_lora_loading_path=dynamic_lora_loading_path,
             **self.engine_kwargs,
         )
 
@@ -518,6 +569,7 @@ def expected_input_keys(self) -> List[str]:
 
     def __del__(self):
         if hasattr(self, "llm"):
+            # Kill the engine processes.
             self.llm.shutdown()
 
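
The LoRA registration above uses a check, lock, re-check pattern so that concurrent requests for the same adapter trigger only one download and one registration. A simplified, hypothetical sketch of that pattern (not the actual wrapper class):

```python
import asyncio


class LoraRegistry:
    """Minimal illustration of the check / lock / re-check caching pattern."""

    def __init__(self):
        self._lock = asyncio.Lock()
        self._cache = {}  # lora_name -> registered request

    async def get_or_register(self, lora_name, load_fn):
        if lora_name not in self._cache:              # fast path without the lock
            async with self._lock:
                if lora_name not in self._cache:      # re-check once the lock is held
                    # Only the first caller downloads and registers the adapter.
                    self._cache[lora_name] = load_fn(lora_name)
        return self._cache[lora_name]
```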

python/ray/llm/_internal/batch/utils.py

Lines changed: 56 additions & 1 deletion
@@ -1,6 +1,13 @@
 """Utility functions for batch processing."""
 import logging
-from typing import TYPE_CHECKING, Any, Union
+import os
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+from ray.llm._internal.common.utils.cloud_utils import (
+    CloudMirrorConfig,
+    is_remote_path,
+)
+from ray.llm._internal.common.utils.download_utils import CloudModelDownloader
 
 if TYPE_CHECKING:
     from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -60,3 +67,51 @@ def __len__(self):
 
     tokenizer.__class__ = CachedTokenizer
     return tokenizer
+
+
+def download_hf_model(model_source: str, tokenizer_only: bool = True) -> str:
+    """Download the HF model from the model source.
+
+    Args:
+        model_source: The model source path.
+        tokenizer_only: Whether to download only the tokenizer.
+
+    Returns:
+        The local path to the downloaded model.
+    """
+
+    bucket_uri = None
+    if is_remote_path(model_source):
+        bucket_uri = model_source
+
+    mirror_config = CloudMirrorConfig(bucket_uri=bucket_uri)
+    downloader = CloudModelDownloader(model_source, mirror_config)
+    return downloader.get_model(tokenizer_only=tokenizer_only)
+
+
+def download_lora_adapter(
+    lora_name: str,
+    remote_path: Optional[str] = None,
+) -> str:
+    """If remote_path is specified, pull the lora to the local
+    directory and return the local path.
+
+    Args:
+        lora_name: The lora name.
+        remote_path: The remote path to the lora. If specified, the remote_path will be
+            used as the base path to load the lora.
+
+    Returns:
+        The local path to the lora if remote_path is specified, otherwise the lora name.
+    """
+    assert not is_remote_path(
+        lora_name
+    ), "lora_name cannot be a remote path (s3:// or gs://)"
+
+    if remote_path is None:
+        return lora_name
+
+    lora_path = os.path.join(remote_path, lora_name)
+    mirror_config = CloudMirrorConfig(bucket_uri=lora_path)
+    downloader = CloudModelDownloader(lora_name, mirror_config)
+    return downloader.get_model(tokenizer_only=False)
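
A hedged usage sketch of the two helpers above, based on their docstrings. It assumes cloud credentials are configured in the environment; the bucket and adapter names are placeholders, and the downloader returns a local directory containing the pulled files:

```python
from ray.llm._internal.batch.utils import download_hf_model, download_lora_adapter

# Remote checkpoint: only the tokenizer files are pulled here, and a local
# path usable by AutoTokenizer/AutoProcessor is returned.
tokenizer_path = download_hf_model("s3://your-bucket/your-model/", tokenizer_only=True)

# No remote base path: the LoRA name passes through unchanged and is later
# treated as a Hugging Face Hub ID or local directory.
assert download_lora_adapter("EdBergJr/Llama32_Baha_3") == "EdBergJr/Llama32_Baha_3"

# With a remote base path: "my_adapter" is joined onto the base path and the
# full adapter is downloaded to a local directory.
lora_path = download_lora_adapter("my_adapter", remote_path="s3://your-lora-bucket")
```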
