
Commit 135419e

[Test][Refactor] Update tests to use require_test_model (#2415)
This PR updates tests to use the `require_test_model` testing util, which gives better out-of-the-box testing while avoiding automatic downloads. Tests that require manual model compilation are kept in the old style (e.g., the "llava" and "eagle" models). This PR also fixes some typing issues reported by mypy.
1 parent cfc0597 commit 135419e

11 files changed: +290 −174 lines
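For reference, the test pattern this PR moves to looks roughly like the sketch below. It is illustrative only: the model name and the skip-on-missing behavior are taken from the diffs that follow, while the exact engine calls are simplified.

```python
# Sketch of the test style introduced by this PR (illustrative, not part of the diff).
# `require_test_model` resolves the model name against local candidate paths and
# skips the test instead of downloading when the weights are not found.
from mlc_llm.serve import MLCEngine
from mlc_llm.testing import require_test_model


@require_test_model("Llama-2-7b-chat-hf-q0f16-MLC")
def test_chat_smoke(model: str):
    engine = MLCEngine(model=model)
    response = engine.chat.completions.create(
        messages=[{"role": "user", "content": "What is the meaning of life?"}],
        stream=False,
    )
    assert len(response.choices) > 0
    engine.terminate()
```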

python/mlc_llm/serve/engine.py

Lines changed: 24 additions & 13 deletions
@@ -32,16 +32,21 @@
 logger = logging.getLogger(__name__)
 
 
+# Note: we define both AsyncChat and Chat for Python type analysis.
+class AsyncChat:  # pylint: disable=too-few-public-methods
+    """The proxy class to direct to async chat completions."""
+
+    def __init__(self, engine: weakref.ReferenceType) -> None:
+        assert isinstance(engine(), AsyncMLCEngine)
+        self.completions = AsyncChatCompletion(engine)
+
+
 class Chat:  # pylint: disable=too-few-public-methods
     """The proxy class to direct to chat completions."""
 
     def __init__(self, engine: weakref.ReferenceType) -> None:
-        assert isinstance(engine(), (AsyncMLCEngine, MLCEngine))
-        self.completions = (
-            AsyncChatCompletion(engine)  # type: ignore
-            if isinstance(engine(), AsyncMLCEngine)
-            else ChatCompletion(engine)  # type: ignore
-        )
+        assert isinstance(engine(), MLCEngine)
+        self.completions = ChatCompletion(engine)
 
 
 class AsyncChatCompletion:  # pylint: disable=too-few-public-methods

@@ -151,7 +156,7 @@ async def create(  # pylint: disable=too-many-arguments,too-many-locals
             Extra debug options to pass to the request.
 
         Returns
-        ------
+        -------
         response : ChatCompletionResponse
             The chat completion response conforming to OpenAI API.
             See mlc_llm/protocol/openai_api_protocol.py or

@@ -643,7 +648,7 @@ def create(  # pylint: disable=too-many-arguments,too-many-locals
         response_format: Optional[Dict[str, Any]] = None,
         request_id: Optional[str] = None,
         debug_config: Optional[Dict[str, Any]] = None,
-    ) -> openai_api_protocol.CompletionResponse:
+    ) -> Iterator[openai_api_protocol.CompletionResponse]:
        """Synchronous streaming completion interface with OpenAI API compatibility.
        The method streams back CompletionResponse that conforms to
        OpenAI API one at a time via yield.

@@ -698,7 +703,7 @@ def create(  # pylint: disable=too-many-arguments,too-many-locals
         response_format: Optional[Dict[str, Any]] = None,
         request_id: Optional[str] = None,
         debug_config: Optional[Dict[str, Any]] = None,
-    ) -> Iterator[openai_api_protocol.CompletionResponse]:
+    ) -> openai_api_protocol.CompletionResponse:
        """Synchronous non-streaming completion interface with OpenAI API compatibility.
 
        See https://platform.openai.com/docs/api-reference/completions/create for specification.

@@ -714,7 +719,7 @@ def create(  # pylint: disable=too-many-arguments,too-many-locals
             Extra debug options to pass to the request.
 
         Returns
-        ------
+        -------
         response : CompletionResponse
             The completion response conforming to OpenAI API.
             See mlc_llm/protocol/openai_api_protocol.py or

@@ -750,7 +755,10 @@ def create(  # pylint: disable=too-many-arguments,too-many-locals
         response_format: Optional[Dict[str, Any]] = None,
         request_id: Optional[str] = None,
         debug_config: Optional[Dict[str, Any]] = None,
-    ) -> Iterator[openai_api_protocol.CompletionResponse]:
+    ) -> Union[
+        Iterator[openai_api_protocol.CompletionResponse],
+        openai_api_protocol.CompletionResponse,
+    ]:
        """Synchronous completion interface with OpenAI API compatibility.
 
        See https://platform.openai.com/docs/api-reference/completions/create for specification.

@@ -864,7 +872,7 @@ def __init__(  # pylint: disable=too-many-arguments,too-many-locals
             engine_config=engine_config,
             enable_tracing=enable_tracing,
         )
-        self.chat = Chat(weakref.ref(self))
+        self.chat = AsyncChat(weakref.ref(self))
         self.completions = AsyncCompletion(weakref.ref(self))
 
     async def abort(self, request_id: str) -> None:

@@ -1568,7 +1576,10 @@ def _completion(  # pylint: disable=too-many-arguments,too-many-locals
         response_format: Optional[Dict[str, Any]] = None,
         request_id: Optional[str] = None,
         debug_config: Optional[Dict[str, Any]] = None,
-    ) -> Iterator[openai_api_protocol.CompletionResponse]:
+    ) -> Union[
+        Iterator[openai_api_protocol.CompletionResponse],
+        openai_api_protocol.CompletionResponse,
+    ]:
        """Synchronous completion internal interface with OpenAI API compatibility.
 
        See https://platform.openai.com/docs/api-reference/completions/create for specification.
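The AsyncChat/Chat split above is what lets a type checker tell, from the static type of the engine, whether `chat.completions.create` must be awaited; previously `Chat` picked `AsyncChatCompletion` or `ChatCompletion` at runtime, so mypy only saw a union and `# type: ignore` comments were needed. A minimal sketch of the distinction (the call details below are simplified, not copied from the module):

```python
# Minimal sketch: after the split, the completion proxy type follows the engine type,
# so no runtime isinstance check (and no `# type: ignore`) is needed.
from mlc_llm.serve import AsyncMLCEngine, MLCEngine


def sync_usage(engine: MLCEngine) -> None:
    # engine.chat is a Chat, so completions.create is a plain call.
    response = engine.chat.completions.create(
        messages=[{"role": "user", "content": "hi"}], stream=False
    )
    print(response.choices[0].message.content)


async def async_usage(engine: AsyncMLCEngine) -> None:
    # engine.chat is an AsyncChat, so completions.create must be awaited.
    response = await engine.chat.completions.create(
        messages=[{"role": "user", "content": "hi"}], stream=False
    )
    print(response.choices[0].message.content)
```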

python/mlc_llm/support/constants.py

Lines changed: 4 additions & 1 deletion
@@ -53,7 +53,10 @@ def _get_test_model_path() -> List[Path]:
     # by default, we reuse the cache dir via mlc_llm chat
     # note that we do not auto download for testcase
     # to avoid networking dependencies
-    return [_get_cache_dir() / "model_weights" / "mlc-ai"]
+    return [
+        _get_cache_dir() / "model_weights" / "mlc-ai",
+        Path(os.path.abspath(os.path.curdir)),
+    ]
 
 
 MLC_TEMP_DIR = os.getenv("MLC_TEMP_DIR", None)
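With the second entry, a model compiled into the current working directory is also discovered by the test lookup: the `mlc_llm chat` weight cache is tried first, then the current directory, and a directory counts as a model only if it contains `mlc-chat-config.json`. A hedged sketch of that lookup order (the helper name and cache-dir argument below are illustrative, not the module's actual API):

```python
# Illustrative sketch of the lookup order implied by the change above; the real
# logic lives in _get_test_model_path() together with require_test_model().
import os
from pathlib import Path
from typing import Optional


def resolve_test_model(model: str, cache_dir: Path) -> Optional[Path]:
    candidates = [
        cache_dir / "model_weights" / "mlc-ai",  # default: cache populated by `mlc_llm chat`
        Path(os.path.abspath(os.path.curdir)),   # newly added: current working directory
    ]
    for base in candidates:
        if (base / model / "mlc-chat-config.json").is_file():
            return base / model
    return None  # caller skips the test rather than downloading
```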
Lines changed: 43 additions & 18 deletions
@@ -1,14 +1,16 @@
 """Extra utilities to mark tests"""
 
 import functools
+import inspect
+from pathlib import Path
 from typing import Callable
 
 import pytest
 
 from mlc_llm.support.constants import MLC_TEST_MODEL_PATH
 
 
-def require_test_model(model: str):
+def require_test_model(*models: str):
     """Testcase decorator to require a model
 
     Examples

@@ -24,31 +26,54 @@ def test_reload_reset_unload(model):
 
     Parameters
     ----------
-    model : str
-        The model dir name
+    models : List[str]
+        The model directories or URLs.
     """
-    model_path = None
-    for base_path in MLC_TEST_MODEL_PATH:
-        if (base_path / model / "mlc-chat-config.json").is_file():
-            model_path = base_path / model
-    missing_model = model_path is None
+    model_paths = []
+    missing_models = []
+
+    for model in models:
+        model_path = None
+        for base_path in MLC_TEST_MODEL_PATH:
+            if (base_path / model / "mlc-chat-config.json").is_file():
+                model_path = base_path / model
+        if model_path is None and (Path(model) / "mlc-chat-config.json").is_file():
+            model_path = Path(model)
+
+        if model_path is None:
+            missing_models.append(model)
+        else:
+            model_paths.append(str(model_path))
+
     message = (
-        f"Model {model} does not exist in candidate paths {[str(p) for p in MLC_TEST_MODEL_PATH]},"
+        f"Model {', '.join(missing_models)} not found in candidate paths "
+        f"{[str(p) for p in MLC_TEST_MODEL_PATH]},"
         " if you set MLC_TEST_MODEL_PATH, please ensure model paths are in the right location,"
         " by default we reuse cache, try to run mlc_llm chat to download right set of models."
     )
 
-    def _decorator(func: Callable[[str], None]):
-        wrapped = functools.partial(func, str(model_path))
+    def _decorator(func: Callable[..., None]):
+        wrapped = functools.partial(func, *model_paths)
         wrapped.__name__ = func.__name__  # type: ignore
 
-        @functools.wraps(wrapped)
-        def wrapper(*args, **kwargs):
-            if missing_model:
-                print(f"{message} skipping...")
-                return
-            wrapped(*args, **kwargs)
+        if inspect.iscoroutinefunction(wrapped):
+            # The function is a coroutine function ("async def func(...)")
+            @functools.wraps(wrapped)
+            async def wrapper(*args, **kwargs):
+                if len(missing_models) > 0:
+                    print(f"{message} skipping...")
+                    return
+                await wrapped(*args, **kwargs)
+
+        else:
+            # The function is a normal function ("def func(...)")
+            @functools.wraps(wrapped)
+            def wrapper(*args, **kwargs):
+                if len(missing_models) > 0:
+                    print(f"{message} skipping...")
+                    return
+                wrapped(*args, **kwargs)
 
-        return pytest.mark.skipif(missing_model, reason=message)(wrapper)
+        return pytest.mark.skipif(len(missing_models) > 0, reason=message)(wrapper)
 
     return _decorator
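Since the resolved paths are bound positionally through `functools.partial`, a decorated test receives one argument per model in the order they are listed, and coroutine tests get an async wrapper so they are still awaited. A usage sketch (model names match those used in the updated tests; the test body is a placeholder):

```python
# Usage sketch for the multi-model, async case handled by the decorator above.
from mlc_llm.testing import require_test_model


@require_test_model(
    "Llama-2-7b-chat-hf-q0f16-MLC",    # bound to `model`
    "Llama-2-7b-chat-hf-q4f16_1-MLC",  # bound to `small_model`
)
async def test_speculative_decode(model: str, small_model: str):
    # Skipped entirely if either model directory cannot be found locally.
    ...
```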

tests/python/serve/test_serve_async_engine.py

Lines changed: 13 additions & 10 deletions
@@ -4,6 +4,7 @@
 from typing import List
 
 from mlc_llm.serve import AsyncMLCEngine, EngineConfig, GenerationConfig
+from mlc_llm.testing import require_test_model
 
 prompts = [
     "What is the meaning of life?",

@@ -19,9 +20,9 @@
 ]
 
 
-async def test_engine_generate():
+@require_test_model("Llama-2-7b-chat-hf-q0f16-MLC")
+async def test_engine_generate(model: str):
     # Create engine
-    model = "HF://mlc-ai/Llama-2-7b-chat-hf-q0f16-MLC"
     async_engine = AsyncMLCEngine(
         model=model,
         mode="server",

@@ -74,9 +75,9 @@ async def generate_task(
     del async_engine
 
 
-async def test_chat_completion():
+@require_test_model("Llama-2-7b-chat-hf-q0f16-MLC")
+async def test_chat_completion(model: str):
     # Create engine
-    model = "HF://mlc-ai/Llama-2-7b-chat-hf-q0f16-MLC"
     async_engine = AsyncMLCEngine(
         model=model,
         mode="server",

@@ -101,6 +102,7 @@ async def generate_task(prompt: str, request_id: str):
         ):
             for choice in response.choices:
                 assert choice.delta.role == "assistant"
+                assert isinstance(choice.delta.content, str)
                 output_texts[rid][choice.index] += choice.delta.content
 
     tasks = [

@@ -124,9 +126,9 @@ async def generate_task(prompt: str, request_id: str):
     del async_engine
 
 
-async def test_chat_completion_non_stream():
+@require_test_model("Llama-2-7b-chat-hf-q0f16-MLC")
+async def test_chat_completion_non_stream(model: str):
     # Create engine
-    model = "HF://mlc-ai/Llama-2-7b-chat-hf-q0f16-MLC"
     async_engine = AsyncMLCEngine(
         model=model,
         mode="server",

@@ -150,6 +152,7 @@ async def generate_task(prompt: str, request_id: str):
         )
         for choice in response.choices:
             assert choice.message.role == "assistant"
+            assert isinstance(choice.message.content, str)
             output_texts[rid][choice.index] += choice.message.content
 
     tasks = [

@@ -173,9 +176,9 @@ async def generate_task(prompt: str, request_id: str):
     del async_engine
 
 
-async def test_completion():
+@require_test_model("Llama-2-7b-chat-hf-q0f16-MLC")
+async def test_completion(model: str):
     # Create engine
-    model = "HF://mlc-ai/Llama-2-7b-chat-hf-q0f16-MLC"
     async_engine = AsyncMLCEngine(
         model=model,
         mode="server",

@@ -223,9 +226,9 @@ async def generate_task(prompt: str, request_id: str):
     del async_engine
 
 
-async def test_completion_non_stream():
+@require_test_model("Llama-2-7b-chat-hf-q0f16-MLC")
+async def test_completion_non_stream(model: str):
     # Create engine
-    model = "HF://mlc-ai/Llama-2-7b-chat-hf-q0f16-MLC"
     async_engine = AsyncMLCEngine(
         model=model,
         mode="server",
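The added `isinstance` asserts belong to the mypy fixes mentioned in the commit message: the delta/message `content` field is presumably typed as optional in the OpenAI protocol classes, and the assert narrows it to `str` before it is concatenated. The narrowing pattern in isolation:

```python
# Minimal illustration of the narrowing pattern used in the asserts above.
from typing import Optional


def append_chunk(buffer: str, content: Optional[str]) -> str:
    assert isinstance(content, str)  # narrows Optional[str] to str for mypy
    return buffer + content
```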

tests/python/serve/test_serve_async_engine_spec.py

Lines changed: 6 additions & 3 deletions
@@ -4,6 +4,7 @@
 from typing import List
 
 from mlc_llm.serve import AsyncMLCEngine, EngineConfig, GenerationConfig
+from mlc_llm.testing import require_test_model
 
 prompts = [
     "What is the meaning of life?",

@@ -19,10 +20,12 @@
 ]
 
 
-async def test_engine_generate():
+@require_test_model(
+    "Llama-2-7b-chat-hf-q0f16-MLC",
+    "Llama-2-7b-chat-hf-q4f16_1-MLC",
+)
+async def test_engine_generate(model: str, small_model: str):
     # Create engine
-    model = "HF://mlc-ai/Llama-2-7b-chat-hf-q0f16-MLC"
-    small_model = "HF://mlc-ai/Llama-2-7b-chat-hf-q4f16_1-MLC"
     async_engine = AsyncMLCEngine(
         model=model,
         mode="server",
