Merge pull request #1651 from basetenlabs/bump-version-0.9.92

rcano-baseten · web-flow · commit 8c28b26195b8 · 2025-05-16T17:12:22.000-04:00
Release 0.9.92
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "truss"
-version = "0.9.91"
+version = "0.9.92"
 description = "A seamless bridge from model development to model delivery"
 license = "MIT"
 readme = "README.md"
diff --git a/truss/base/trt_llm_config.py b/truss/base/trt_llm_config.py
@@ -4,7 +4,7 @@
 import os
 import warnings
 from enum import Enum
-from typing import TYPE_CHECKING, Annotated, Any, Dict, Literal, Optional
+from typing import TYPE_CHECKING, Annotated, Dict, Literal, Optional
 
 from huggingface_hub.errors import HFValidationError
 from huggingface_hub.utils import validate_repo_id
@@ -26,6 +26,13 @@
 ENGINE_BUILDER_TRUSS_RUNTIME_MIGRATION = (
     os.environ.get("ENGINE_BUILDER_TRUSS_RUNTIME_MIGRATION", "False") == "True"
 )
+try:
+    from truss.base import custom_types
+
+    PydanticTrTBaseModel = custom_types.ConfigModel
+except ImportError:
+    # fallback for briton
+    PydanticTrTBaseModel = BaseModel  # type: ignore[assignment,misc]
 
 
 class TrussTRTLLMModel(str, Enum):
@@ -54,13 +61,13 @@ class TrussTRTLLMQuantizationType(str, Enum):
     FP4_KV = "fp4_kv"
 
 
-class TrussTRTLLMPluginConfiguration(BaseModel):
+class TrussTRTLLMPluginConfiguration(PydanticTrTBaseModel):
     paged_kv_cache: bool = True
     use_paged_context_fmha: bool = True
     use_fp8_context_fmha: bool = False
 
 
-class TrussTRTQuantizationConfiguration(BaseModel):
+class TrussTRTQuantizationConfiguration(PydanticTrTBaseModel):
     """Configuration for quantization of TRT models
 
     Args:
@@ -96,7 +103,7 @@ class CheckpointSource(str, Enum):
     REMOTE_URL = "REMOTE_URL"
 
 
-class CheckpointRepository(BaseModel):
+class CheckpointRepository(PydanticTrTBaseModel):
     source: CheckpointSource
     repo: str
     revision: Optional[str] = None
@@ -125,7 +132,7 @@ class TrussSpecDecMode(str, Enum):
     LOOKAHEAD_DECODING = "LOOKAHEAD_DECODING"
 
 
-class TrussTRTLLMRuntimeConfiguration(BaseModel):
+class TrussTRTLLMRuntimeConfiguration(PydanticTrTBaseModel):
     kv_cache_free_gpu_mem_fraction: float = 0.9
     kv_cache_host_memory_bytes: Optional[Annotated[int, Field(strict=True, ge=1)]] = (
         None
@@ -144,12 +151,12 @@ class TrussTRTLLMRuntimeConfiguration(BaseModel):
     ] = None
 
 
-class TrussTRTLLMLoraConfiguration(BaseModel):
+class TrussTRTLLMLoraConfiguration(PydanticTrTBaseModel):
     max_lora_rank: int = 64
     lora_target_modules: list[str] = []
 
 
-class TrussTRTLLMBuildConfiguration(BaseModel):
+class TrussTRTLLMBuildConfiguration(PydanticTrTBaseModel):
     base_model: TrussTRTLLMModel = TrussTRTLLMModel.DECODER
     max_seq_len: Optional[Annotated[int, Field(strict=True, ge=1, le=1048576)]] = None
     max_batch_size: Annotated[int, Field(strict=True, ge=1, le=2048)] = 256
@@ -302,8 +309,14 @@ def max_draft_len(self) -> Optional[int]:
             return self.speculator.num_draft_tokens
         return None
 
+    @property
+    def parsed_trt_llm_build_configs(self) -> list["TrussTRTLLMBuildConfiguration"]:
+        """Return this build config, plus the speculator's build config when it has one."""
+        spec_build = self.speculator.build if self.speculator else None
+        return [self, spec_build] if spec_build else [self]
+
 
-class TrussSpeculatorConfiguration(BaseModel):
+class TrussSpeculatorConfiguration(PydanticTrTBaseModel):
     speculative_decoding_mode: TrussSpecDecMode = TrussSpecDecMode.DRAFT_EXTERNAL
     num_draft_tokens: Optional[Annotated[int, Field(strict=True, ge=1)]] = None
     checkpoint_repository: Optional[CheckpointRepository] = None
@@ -408,7 +421,7 @@ def resolved_checkpoint_repository(self) -> CheckpointRepository:
             )
 
 
-class VersionsOverrides(BaseModel):
+class VersionsOverrides(PydanticTrTBaseModel):
     # If an override is specified, it takes precedence over the backend's current
     # default version. The version is used to create a full image ref and should look
     # like a semver, e.g. for the briton the version `0.17.0-fd30ac1` could be specified
@@ -418,8 +431,16 @@ class VersionsOverrides(BaseModel):
     briton_version: Optional[str] = None
     bei_version: Optional[str] = None
 
+    @model_validator(mode="before")
+    def version_must_start_with_number(cls, data):  # raw input; may not be a dict
+        for field in ["engine_builder_version", "briton_version", "bei_version"]:
+            v = data.get(field) if isinstance(data, dict) else None
+            if v is not None and (not v or not v[0].isdigit()):
+                raise ValueError(f"{field} must start with a number")
+        return data
+
 
-class ImageVersions(BaseModel):
+class ImageVersions(PydanticTrTBaseModel):
     # Required versions for patching truss config during docker build setup.
     # The schema of this model must be such that it can parse the values serialized
     # from the backend. The inserted values are full image references, resolved using
@@ -428,50 +449,16 @@ class ImageVersions(BaseModel):
     briton_image: str
 
 
-class TRTLLMConfiguration(BaseModel):
-    runtime: TrussTRTLLMRuntimeConfiguration = TrussTRTLLMRuntimeConfiguration()
+class TRTLLMConfiguration(PydanticTrTBaseModel):
     build: TrussTRTLLMBuildConfiguration
+    runtime: TrussTRTLLMRuntimeConfiguration = TrussTRTLLMRuntimeConfiguration()
     # If versions are not set, the baseten backend will insert current defaults.
     version_overrides: VersionsOverrides = VersionsOverrides()
 
     def model_post_init(self, __context):
         self.add_bei_default_route()
         self.chunked_context_fix()
 
-    @model_validator(mode="before")
-    @classmethod
-    def migrate_runtime_fields(cls, data: Any) -> Any:
-        extra_runtime_fields = {}
-        valid_build_fields = {}
-        if isinstance(data.get("build"), dict):
-            for key, value in data.get("build").items():
-                if key in TrussTRTLLMBuildConfiguration.__annotations__:
-                    valid_build_fields[key] = value
-                else:
-                    if key in TrussTRTLLMRuntimeConfiguration.__annotations__:
-                        logger.warning(f"Found runtime.{key}: {value} in build config")
-                        extra_runtime_fields[key] = value
-            if extra_runtime_fields:
-                logger.warning(
-                    f"Found extra fields {list(extra_runtime_fields.keys())} in build configuration, unspecified runtime fields will be configured using these values."
-                    " This configuration of deprecated fields is scheduled for removal, please upgrade to the latest truss version and update configs according to https://docs.baseten.co/performance/engine-builder-config."
-                )
-                if data.get("runtime"):
-                    data.get("runtime").update(
-                        {
-                            k: v
-                            for k, v in extra_runtime_fields.items()
-                            if k not in data.get("runtime")
-                        }
-                    )
-                else:
-                    data.update(
-                        {"runtime": {k: v for k, v in extra_runtime_fields.items()}}
-                    )
-            data.update({"build": valid_build_fields})
-            return data
-        return data
-
     def chunked_context_fix(self: "TRTLLMConfiguration") -> "TRTLLMConfiguration":
         """check if there is an error wrt. runtime.enable_chunked_context"""
         if (
@@ -482,16 +469,8 @@ def chunked_context_fix(self: "TRTLLMConfiguration") -> "TRTLLMConfiguration":
                 and self.build.plugin_configuration.paged_kv_cache
             )
         ):
-            logger.warning(
-                "If trt_llm.runtime.enable_chunked_context is True, then trt_llm.build.plugin_configuration.use_paged_context_fmha and trt_llm.build.plugin_configuration.paged_kv_cache should be True. "
-                "Setting trt_llm.build.plugin_configuration.use_paged_context_fmha and trt_llm.build.plugin_configuration.paged_kv_cache to True."
-            )
-            self.build = self.build.model_copy(
-                update={
-                    "plugin_configuration": self.build.plugin_configuration.model_copy(
-                        update={"use_paged_context_fmha": True, "paged_kv_cache": True}
-                    )
-                }
+            raise ValueError(
+                "If trt_llm.runtime.enable_chunked_context is True, then trt_llm.build.plugin_configuration.use_paged_context_fmha and trt_llm.build.plugin_configuration.paged_kv_cache need to be True. "
             )
 
         return self
diff --git a/truss/base/truss_config.py b/truss/base/truss_config.py
@@ -611,16 +611,6 @@ class Config:
     def canonical_python_version(self) -> str:
         return to_dotted_python_version(self.python_version)
 
-    @property
-    def parsed_trt_llm_build_configs(
-        self,
-    ) -> list[trt_llm_config.TrussTRTLLMBuildConfiguration]:
-        if self.trt_llm:
-            if self.trt_llm.build.speculator and self.trt_llm.build.speculator.build:
-                return [self.trt_llm.build, self.trt_llm.build.speculator.build]
-            return [self.trt_llm.build]
-        return []
-
     def to_dict(self, verbose: bool = True) -> dict:
         self.runtime.sync_is_websocket()  # type: ignore[operator]  # This is callable.
         data = super().to_dict(verbose)
diff --git a/truss/cli/cli.py b/truss/cli/cli.py
@@ -50,7 +50,6 @@
 from truss.remote.remote_factory import USER_TRUSSRC_PATH, RemoteFactory
 from truss.trt_llm.config_checks import (
     has_no_tags_trt_llm_builder,
-    is_missing_secrets_for_trt_llm_builder,
     memory_updated_for_trt_llm_builder,
     uses_trt_llm_builder,
 )
@@ -153,6 +152,16 @@ def _get_required_option(ctx: click.Context, name: str) -> object:
     return value
 
 
+def _prepare_click_context(f: click.Command, params: dict) -> click.Context:
+    """Build a fresh click context for `f` so it can be run via `f.invoke(ctx)`."""
+    # Reuse the root context's `obj` so the new context carries the same object.
+    root_obj = click.get_current_context().find_root().obj
+    new_ctx = click.Context(f, obj=root_obj)
+    # `params` stands in for the parsed CLI arguments of the command.
+    new_ctx.params = params
+    return new_ctx
+
+
 def _log_level_option(f: Callable[..., object]) -> Callable[..., object]:
     return click.option(
         "--log",
@@ -753,21 +762,20 @@ def push(
             console.print(live_reload_disabled_text, style="red")
             sys.exit(1)
 
-        if is_missing_secrets_for_trt_llm_builder(tr):
-            missing_token_text = (
-                "`hf_access_token` must be provided in secrets to build a gated model. "
-                "Please see https://docs.baseten.co/deploy/guides/private-model for configuration instructions."
-            )
-            console.print(missing_token_text, style="yellow")
         if memory_updated_for_trt_llm_builder(tr):
             console.print(
                 f"Automatically increasing memory for trt-llm builder to {TRTLLM_MIN_MEMORY_REQUEST_GI}Gi."
             )
-        message_oai = has_no_tags_trt_llm_builder(tr)
+        message_oai, raised_message_oai = has_no_tags_trt_llm_builder(tr)
         if message_oai:
-            console.print(message_oai, style="red")
-            sys.exit(1)
-        for trt_llm_build_config in tr.spec.config.parsed_trt_llm_build_configs:
+            console.print(message_oai, style="yellow")
+            if raised_message_oai:
+                console.print(message_oai, style="red")
+                sys.exit(1)
+
+        for (
+            trt_llm_build_config
+        ) in tr.spec.config.trt_llm.build.parsed_trt_llm_build_configs:
             if (
                 trt_llm_build_config.quantization_type
                 in [TrussTRTLLMQuantizationType.FP8, TrussTRTLLMQuantizationType.FP8_KV]
@@ -1722,19 +1730,18 @@ def deploy_checkpoints(
         ),
     )
 
-    ctx = click.Context(push, obj={})
-    ctx.params = {
+    params = {
         "target_directory": prepare_checkpoint_result.truss_directory,
         "remote": remote,
         "model_name": prepare_checkpoint_result.checkpoint_deploy_config.model_name,
         "publish": True,
         "deployment_name": prepare_checkpoint_result.checkpoint_deploy_config.deployment_name,
     }
+    ctx = _prepare_click_context(push, params)
     if dry_run:
         console.print("--dry-run flag provided, not deploying", style="yellow")
     else:
         push.invoke(ctx)
-
     train_cli.print_deploy_checkpoints_success_message(prepare_checkpoint_result)
 
 
diff --git a/truss/tests/conftest.py b/truss/tests/conftest.py
@@ -780,45 +780,16 @@ def deprecated_trtllm_config(default_config) -> Dict[str, Any]:
             "base_model": "llama",
             "max_seq_len": 2048,
             "max_batch_size": 512,
-            # start deprecated fields
-            "kv_cache_free_gpu_mem_fraction": 0.1,
-            "enable_chunked_context": True,
-            "batch_scheduler_policy": TrussTRTLLMBatchSchedulerPolicy.MAX_UTILIZATION.value,
-            "request_default_max_tokens": 10,
-            "total_token_limit": 50,
-            # end deprecated fields
             "checkpoint_repository": {"source": "HF", "repo": "meta/llama4-500B"},
             "gather_all_token_logits": False,
-        }
-    }
-    return trtllm_config
-
-
-@pytest.fixture
-def deprecated_trtllm_config_with_runtime_existing(default_config) -> Dict[str, Any]:
-    trtllm_config = default_config
-    trtllm_config["resources"] = {
-        "accelerator": Accelerator.L4.value,
-        "cpu": "1",
-        "memory": "24Gi",
-        "use_gpu": True,
-    }
-    trtllm_config["trt_llm"] = {
-        "build": {
-            "base_model": "llama",
-            "max_seq_len": 2048,
-            "max_batch_size": 512,
-            # start deprecated fields
+        },
+        "runtime": {
+            "total_token_limit": 100,
             "kv_cache_free_gpu_mem_fraction": 0.1,
             "enable_chunked_context": True,
-            "batch_scheduler_policy": TrussTRTLLMBatchSchedulerPolicy.MAX_UTILIZATION.value,
+            "batch_scheduler_policy": "max_utilization",
             "request_default_max_tokens": 10,
-            "total_token_limit": 50,
-            # end deprecated fields
-            "checkpoint_repository": {"source": "HF", "repo": "meta/llama4-500B"},
-            "gather_all_token_logits": False,
         },
-        "runtime": {"total_token_limit": 100},
     }
     return trtllm_config
 
diff --git a/truss/tests/test_data/test_streaming_truss_with_tracing/config.yaml b/truss/tests/test_data/test_streaming_truss_with_tracing/config.yaml
@@ -30,7 +30,7 @@ requirements: []
 requirements_file: null
 resources:
   accelerator: null
-  cpu: '1'
+  cpu: "1"
   memory: 2Gi
   use_gpu: false
 runtime:
@@ -46,7 +46,6 @@ runtime:
     kind: websocket
   truss_server_version_override: null
 secrets: {}
-spec_version: '2.0'
+spec_version: "2.0"
 system_packages: []
-trt_llm: null
 use_local_src: false
diff --git a/truss/tests/test_data/test_trt_llm_truss/config.yaml b/truss/tests/test_data/test_trt_llm_truss/config.yaml
@@ -1,4 +1,7 @@
 model_name: Test
+model_metadata:
+  tags:
+    - openai-compatible
 resources:
   accelerator: A100
   use_gpu: True
diff --git a/truss/tests/trt_llm/test_trt_llm_config.py b/truss/tests/trt_llm/test_trt_llm_config.py
diff --git a/truss/tests/util/test_config_checks.py b/truss/tests/util/test_config_checks.py
diff --git a/truss/trt_llm/config_checks.py b/truss/trt_llm/config_checks.py