Commit 8b38a4b

[REFACTOR] Remove dependencies on legacy chat_module (#2424)
This PR removes all dependencies on the legacy chat_module.py so we can prepare to deprecate that module. It also refactors MLCChatConfig and moves it into the protocol package, which helps consolidate the API specs and config schemas under the protocol folder. The protocol folder mainly keeps data schemas and metadata; most of the actions (e.g. gen_config) stay in their current locations.
1 parent 13c0661 commit 8b38a4b
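
As a quick illustration of the consolidation (a sketch; only the new import path is confirmed by the gen_config.py diff below), downstream code now imports the chat-config schema from the protocol package instead of relying on a dataclass defined next to the gen_config action:

    # Before this PR: MLCChatConfig was a dataclass defined inside
    # python/mlc_llm/interface/gen_config.py.
    # After this PR: the schema lives under the protocol package.
    from mlc_llm.protocol.mlc_chat_config import MLCChatConfig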

15 files changed: +198 additions, −129 deletions

android/MLCEngineExample/mlc-package-config.json

Lines changed: 4 additions & 1 deletion
@@ -4,7 +4,10 @@
         {
             "model": "HF://mlc-ai/phi-2-q4f16_1-MLC",
             "estimated_vram_bytes": 2036816936,
-            "model_id": "phi-2-q4f16_1"
+            "model_id": "phi-2-q4f16_1",
+            "overrides": {
+                "prefill_chunk_size": 1024
+            }
         }
     ]
 }

python/mlc_llm/chat_module.py

Lines changed: 1 addition & 1 deletion
@@ -783,7 +783,7 @@ def __init__( # pylint: disable=too-many-arguments
 
         self.model_lib = jit.jit(
             model_path=Path(self.model_path),
-            chat_config=asdict(self.chat_config),
+            overrides=asdict(self.chat_config),
            device=self.device,
         ).model_lib_path
         _inspect_model_lib_metadata_memory_usage(self.model_lib, self.config_file_path)

python/mlc_llm/cli/delivery.py

Lines changed: 6 additions & 6 deletions
@@ -9,7 +9,7 @@
 import sys
 import tempfile
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 from huggingface_hub import HfApi  # pylint: disable=import-error
 from huggingface_hub.utils import HfHubHTTPError  # pylint: disable=import-error
@@ -43,11 +43,11 @@ class ModelInfo: # pylint: disable=too-many-instance-attributes
     source_format: str = "auto"
     # If unspecified in CLI, remains to be None and will not be
     # passed to `gen_config` or `convert_weight`
-    context_window_size: int = None
-    sliding_window_size: int = None
-    prefill_chunk_size: int = None
-    attention_sink_size: int = None
-    tensor_parallel_shards: int = None
+    context_window_size: Optional[int] = None
+    sliding_window_size: Optional[int] = None
+    prefill_chunk_size: Optional[int] = None
+    attention_sink_size: Optional[int] = None
+    tensor_parallel_shards: Optional[int] = None
 
 
 class DeferredScope:
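
Several changes in this commit are the same small typing fix: fields previously declared as "x: int = None" become Optional[int]. A minimal sketch of the pattern (the class name below is a hypothetical stand-in, not code from the repository):

    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class ModelInfoSketch:
        # "context_window_size: int = None" defaults a non-optional int to None,
        # which strict type checkers reject; Optional[int] makes the None default
        # legal and explicit.
        context_window_size: Optional[int] = None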

python/mlc_llm/contrib/embeddings/embeddings.py

Lines changed: 1 addition & 2 deletions
@@ -11,7 +11,6 @@
 from tvm.runtime import Device, Module
 from tvm.runtime.relax_vm import VirtualMachine
 
-from mlc_llm.chat_module import _get_model_path
 from mlc_llm.serve import engine_utils
 from mlc_llm.support.auto_device import detect_device
 from mlc_llm.tokenizer import Tokenizer
@@ -143,7 +142,7 @@ def __init__( # pylint: disable=too-many-arguments
         self.mod, self.params, self.metadata = _get_tvm_module(
             model, model_lib_path, self.device, instrument
         )
-        self.model_path, _ = _get_model_path(model)
+        self.model_path = model
         self.tokenizer = Tokenizer(self.model_path)
         self.prefill_func = self.mod["prefill"]

python/mlc_llm/interface/gen_config.py

Lines changed: 10 additions & 60 deletions
@@ -1,15 +1,15 @@
 """Generator of mlc-chat-config.json and tokenizer configuration."""
-
-import dataclasses
+# pylint: disable=E1101
 import json
 import re
 import shutil
 from dataclasses import asdict
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Optional
 
 from mlc_llm.conversation_template import ConvTemplateRegistry
 from mlc_llm.model import Model
+from mlc_llm.protocol.mlc_chat_config import MLCChatConfig
 from mlc_llm.quantization import Quantization
 from mlc_llm.support import convert_tiktoken, logging
 from mlc_llm.support.style import bold, green, red
@@ -22,63 +22,13 @@
 FOUND = green("Found")
 NOT_FOUND = red("Not found")
 FAILED = red("Failed")
-VERSION = "0.1.0"
-
-
-@dataclasses.dataclass
-class MLCChatConfig:  # pylint: disable=too-many-instance-attributes
-    """Fields in the dumped `mlc-chat-config.json` file."""
 
-    model_type: str
-    quantization: str
-    model_config: Dict[str, Any]
-    vocab_size: int
-    context_window_size: int
-    sliding_window_size: int
-    prefill_chunk_size: int
-    attention_sink_size: int
-    tensor_parallel_shards: int
-    # Control the behavior of the runtime
-    mean_gen_len: int = None
-    max_gen_len: int = None
-    shift_fill_factor: float = None
-    # Configuration of text generation
-    temperature: float = None
-    presence_penalty: float = None
-    frequency_penalty: float = None
-    repetition_penalty: float = None
-    top_p: float = None
-    # Conversation template
-    conv_template: Union[str, Dict[str, Any]] = None
-    pad_token_id: int = None
-    bos_token_id: int = None
-    eos_token_id: int = None
-    # Tokenizer configuration
-    tokenizer_files: List[str] = dataclasses.field(default_factory=list)
-    # The content of tokenizer.TokenizerInfo
-    tokenizer_info: Dict[str, Any] = dataclasses.field(default_factory=dict)
-    # Version control
-    version: str = VERSION
 
-    def apply_defaults(self) -> None:
-        """Apply system default value."""
-        defaults = {
-            "pad_token_id": 0,
-            "bos_token_id": 1,
-            "eos_token_id": 2,
-            "temperature": 0.7,
-            "presence_penalty": 0.0,
-            "frequency_penalty": 0.0,
-            "repetition_penalty": 1.0,
-            "top_p": 0.95,
-            "mean_gen_len": 128,
-            "max_gen_len": 512,
-            "shift_fill_factor": 0.3,
-        }
-        for key, value in defaults.items():
-            if getattr(self, key) is None:
-                setattr(self, key, value)
-                logger.info("[System default] Setting %s: %s", bold(key), value)
+def apply_system_defaults_for_missing_fields(mlc_chat_config: MLCChatConfig) -> None:
+    """Apply system default value."""
+    for key, value in mlc_chat_config.get_system_defaults_for_missing_fields().items():
+        setattr(mlc_chat_config, key, value)
+        logger.info("[System default] Setting %s: %s", bold(key), value)
 
 
 def check_string(s: str) -> bool:
@@ -265,10 +215,10 @@ def gen_config( # pylint: disable=too-many-locals,too-many-arguments,too-many-b
     logger.info("Detected tokenizer info: %s", mlc_chat_config.tokenizer_info)
 
     # Step 4. Load system default value
-    mlc_chat_config.apply_defaults()
+    apply_system_defaults_for_missing_fields(mlc_chat_config)
     # Step 5. Dump the configuration file to output directory
     with (output / "mlc-chat-config.json").open("w", encoding="utf-8") as out_file:
-        json.dump(dataclasses.asdict(mlc_chat_config), out_file, indent=2)
+        json.dump(mlc_chat_config.model_dump(), out_file, indent=2)
         logger.info("Dumping configuration file to: %s", bold(out_file.name))
 
 
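
The relocated mlc_llm.protocol.mlc_chat_config.MLCChatConfig itself is not part of this commit view. Judging from the dataclass removed above and from the model_dump() and get_system_defaults_for_missing_fields() calls introduced here, it is presumably a pydantic model roughly along the lines of the sketch below; the real field list, defaults, and method bodies may differ.

    from typing import Optional

    from pydantic import BaseModel


    class MLCChatConfigSketch(BaseModel):
        """Hypothetical subset of the relocated config. The real class also carries
        the model/tokenizer metadata fields of the removed dataclass (quantization,
        vocab_size, tokenizer_files, ...)."""

        temperature: Optional[float] = None
        repetition_penalty: Optional[float] = None
        top_p: Optional[float] = None
        pad_token_id: Optional[int] = None
        bos_token_id: Optional[int] = None
        eos_token_id: Optional[int] = None

        def get_system_defaults_for_missing_fields(self) -> dict:
            """Return system defaults for every field the user left unset (None)."""
            defaults = {
                "temperature": 0.7,
                "repetition_penalty": 1.0,
                "top_p": 0.95,
                "pad_token_id": 0,
                "bos_token_id": 1,
                "eos_token_id": 2,
            }
            return {k: v for k, v in defaults.items() if getattr(self, k) is None}

With a schema like this, gen_config.py applies defaults via the new module-level apply_system_defaults_for_missing_fields helper and serializes the result with model_dump() instead of dataclasses.asdict().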

python/mlc_llm/interface/jit.py

Lines changed: 4 additions & 4 deletions
@@ -49,7 +49,7 @@ def log_jit_policy():
 
 def jit( # pylint: disable=too-many-locals,too-many-statements
     model_path: Path,
-    chat_config: Dict[str, Any],
+    overrides: Dict[str, Any],
     device: Union[Device, str],
     system_lib_prefix: Optional[str] = None,
     *,
@@ -70,7 +70,7 @@ def jit( # pylint: disable=too-many-locals,too-many-statements
     lib_suffix = MLC_DSO_SUFFIX if device not in ["iphone", "android"] else "tar"
 
     def _get_optimization_flags() -> str:
-        opt = chat_config.pop("opt", None)
+        opt = overrides.pop("opt", None)
         if opt is None:
             opt = "O2"
         return repr(OptimizationFlags.from_str(opt))
@@ -79,7 +79,7 @@ def _get_overrides() -> str:
         forbid_list = ["context_window_size", "sliding_window_size", "attention_sink_size"]
         result = []
         for field in dataclasses.fields(ModelConfigOverride):
-            value = chat_config.get(field.name, None)
+            value = overrides.get(field.name, None)
            if value is not None:
                if field.name in forbid_list and value == -1:
                    continue
@@ -92,7 +92,7 @@ def _get_model_config() -> Dict[str, Any]:
         model_config = mlc_chat_config.pop("model_config")
         model_config.update(mlc_chat_config)
         for field in dataclasses.fields(ModelConfigOverride):
-            value = chat_config.get(field.name, None)
+            value = overrides.get(field.name, None)
            if value is not None:
                model_config[field.name] = value
         return MODELS[model_type].config.from_dict(model_config).asdict()
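
Both call sites updated in this commit (chat_module.py above and package.py below) now pass the keyword overrides instead of chat_config. A rough usage sketch based only on what is visible in this diff (the model directory and device are hypothetical, and the full jit.jit signature is not shown here):

    from pathlib import Path

    from mlc_llm.interface import jit

    result = jit.jit(
        model_path=Path("dist/phi-2-q4f16_1-MLC"),  # hypothetical local model directory
        overrides={"opt": "O2", "prefill_chunk_size": 1024},  # keys read by _get_optimization_flags / _get_overrides
        device="cuda",  # hypothetical target device
    )
    model_lib_path = result.model_lib_path  # as used by chat_module.py above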

python/mlc_llm/interface/package.py

Lines changed: 5 additions & 10 deletions
@@ -6,13 +6,11 @@
 import shutil
 import subprocess
 import sys
-from dataclasses import asdict
 from pathlib import Path
 from typing import Any, Dict, List, Literal
 
-from mlc_llm.chat_module import ChatConfig, _get_chat_config, _get_model_path
 from mlc_llm.interface import jit
-from mlc_llm.support import logging, style
+from mlc_llm.support import download, logging, style
 
 logging.enable_logging()
 logger = logging.getLogger(__name__)
@@ -56,6 +54,7 @@ def build_model_library( # pylint: disable=too-many-branches,too-many-locals,to
         bundle_weight = model_entry.get("bundle_weight", False)
         overrides = model_entry.get("overrides", {})
         model_lib = model_entry.get("model_lib", None)
+
         estimated_vram_bytes = model_entry["estimated_vram_bytes"]
         if not isinstance(model, str):
             raise ValueError('The value of "model" in "model_list" is expected to be a string.')
@@ -71,12 +70,8 @@ def build_model_library( # pylint: disable=too-many-branches,too-many-locals,to
             raise ValueError('The value of "model_lib" in "model_list" is expected to be string.')
 
         # - Load model config. Download happens when needed.
-        model_path_and_config_file_path = _get_model_path(model)
-        model_path = Path(model_path_and_config_file_path[0])
-        config_file_path = model_path_and_config_file_path[1]
-        chat_config = _get_chat_config(
-            config_file_path, user_chat_config=ChatConfig.from_dict(overrides)
-        )
+        model_path = download.get_or_download_model(model)
+
         # - Jit compile if the model lib path is not specified.
         model_lib_path = (
             model_lib_path_for_prepare_libs.get(model_lib, None) if model_lib is not None else None
@@ -96,7 +91,7 @@ def build_model_library( # pylint: disable=too-many-branches,too-many-locals,to
         model_lib_path, model_lib = dataclasses.astuple(
             jit.jit(
                 model_path=model_path,
-                chat_config=asdict(chat_config),
+                overrides=overrides,
                 device=device,
                 system_lib_prefix=model_lib,
                 skip_log_jit_policy=True,
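
With this change, build_model_library passes each entry's "overrides" mapping from mlc-package-config.json straight through to jit.jit, and model download is handled by download.get_or_download_model rather than chat_module._get_model_path. A sketch of that flow under the same assumptions as above (not a verbatim excerpt from package.py):

    import dataclasses

    from mlc_llm.interface import jit
    from mlc_llm.support import download

    model_entry = {  # one element of "model_list" in mlc-package-config.json
        "model": "HF://mlc-ai/phi-2-q4f16_1-MLC",
        "model_id": "phi-2-q4f16_1",
        "overrides": {"prefill_chunk_size": 1024},
    }
    model_path = download.get_or_download_model(model_entry["model"])
    model_lib_path, model_lib = dataclasses.astuple(
        jit.jit(
            model_path=model_path,
            overrides=model_entry.get("overrides", {}),
            device="android",  # hypothetical packaging target
            skip_log_jit_policy=True,
        )
    )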

python/mlc_llm/op/position_embedding.py

Lines changed: 1 addition & 1 deletion
@@ -176,7 +176,7 @@ def llama_rope_with_position_map( # pylint: disable=too-many-arguments
     num_q_heads: int,
     num_kv_heads: int,
     dtype: str,
-    rotary_dim: int = None,
+    rotary_dim: Optional[int] = None,
 ):
     """Return the TIR function that computes Llama-style RoPE with q position map.

python/mlc_llm/protocol/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-"""The protocols for MLC LLM server"""
+"""Definitions of pydantic models for API entry points and configurations"""
 from . import openai_api_protocol
 
 RequestProtocol = openai_api_protocol.CompletionRequest

python/mlc_llm/protocol/error_protocol.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 """Error protocols in MLC LLM"""
 
 from http import HTTPStatus
+from typing import Optional
 
 import fastapi
 from pydantic import BaseModel
@@ -18,7 +19,7 @@ class ErrorResponse(BaseModel):
 
     object: str = "error"
     message: str
-    code: int = None
+    code: Optional[int] = None
 
 
 def create_error_response(status_code: HTTPStatus, message: str) -> fastapi.responses.JSONResponse:

0 commit comments
