Commit cfc0597

[Fix] Fix ignore_eos support (#2414)
The ignore_eos support was broken during recent refactors. This PR fixes it by moving ignore_eos into the new debug_config request field instead of accepting it as a top-level request parameter.
1 parent cd79b96 commit cfc0597
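
As the diffs below show, clients now request ignore_eos through the debug_config field rather than a top-level parameter, and the test fixtures start the server with --enable-debug, which appears to gate these debug options. A minimal sketch of the updated request, assuming a server already running locally on port 8000 and a placeholder model id:

    import requests

    # Placeholder endpoint and model id; adjust to your deployment.
    url = "http://127.0.0.1:8000/v1/completions"
    payload = {
        "model": "local-model",
        "prompt": "What is the meaning of life?",
        "max_tokens": 32,
        "stream": False,
        # ignore_eos now lives under debug_config instead of the payload top level.
        "debug_config": {"ignore_eos": True},
    }
    response = requests.post(url, json=payload, timeout=180)
    print(response.json())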

9 files changed: 29 additions, 16 deletions

cpp/serve/engine.cc

Lines changed: 1 addition & 1 deletion

@@ -599,7 +599,7 @@ class EngineModule : public ModuleNode {
   void AddRequest(Request request) { return GetEngine()->AddRequest(std::move(request)); }
   /*! \brief Redirection to `Engine::AbortRequest`. */
   void Abort(const String& request_id) { return GetEngine()->AbortRequest(request_id); }
-
+  /*! \brief Create request with given arguments and the engine default generation config. */
   Request CreateRequest(String id, Array<Data> inputs, String generation_cfg_json_str) {
     auto gen_config =
         GenerationConfig::FromJSON(std::move(generation_cfg_json_str), default_generation_config_);

python/mlc_llm/protocol/openai_api_protocol.py

Lines changed: 5 additions & 1 deletion

@@ -391,6 +391,9 @@ def openai_api_get_generation_config(
 ) -> Dict[str, Any]:
     """Create the generation config from the given request."""
     from ..serve.config import ResponseFormat  # pylint: disable=import-outside-toplevel
+    from ..serve.config import (  # pylint: disable=import-outside-toplevel,redefined-outer-name
+        DebugConfig,
+    )

     kwargs: Dict[str, Any] = {}
     arg_names = [
@@ -404,7 +407,6 @@ def openai_api_get_generation_config(
         "top_logprobs",
         "logit_bias",
         "seed",
-        "debug_config",
     ]
     for arg_name in arg_names:
         kwargs[arg_name] = getattr(request, arg_name)
@@ -418,4 +420,6 @@ def openai_api_get_generation_config(
         kwargs["response_format"] = ResponseFormat(
             **request.response_format.model_dump(by_alias=True)
         )
+    if request.debug_config is not None:
+        kwargs["debug_config"] = DebugConfig(**request.debug_config.model_dump())
     return kwargs

python/mlc_llm/serve/config.py

Lines changed: 4 additions & 0 deletions

@@ -33,13 +33,17 @@ def __post_init__(self):
 class DebugConfig:
     """The debug configuration dataclass.Parameters
     ----------
+    ignore_eos : bool
+        When it is true, ignore the eos token and generate tokens until `max_tokens`.
+        Default is set to False.

     pinned_system_prompt : bool
         Whether the input and generated data pinned in engine. Default is set to False.
         This can be used for system prompt or other purpose, if the data is aimed to be
         kept all the time.
     """

+    ignore_eos: bool = False
     pinned_system_prompt: bool = False

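
For programmatic use, the new field can also be set when constructing the config directly. A minimal sketch, assuming DebugConfig behaves as the plain dataclass its docstring describes:

    from mlc_llm.serve.config import DebugConfig

    # ignore_eos defaults to False; enabling it keeps decoding until max_tokens is hit.
    debug_config = DebugConfig(ignore_eos=True)
    print(debug_config.ignore_eos)            # True
    print(debug_config.pinned_system_prompt)  # False, the unchanged default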

python/mlc_llm/serve/server/popen_server.py

Lines changed: 4 additions & 0 deletions

@@ -28,6 +28,7 @@ def __init__(  # pylint: disable=too-many-arguments
         mode: Literal["local", "interactive", "server"] = "local",
         engine_config: Optional[EngineConfig] = None,
         enable_tracing: bool = False,
+        enable_debug: bool = False,
         host: str = "127.0.0.1",
         port: int = 8000,
     ) -> None:
@@ -43,6 +44,7 @@ def __init__(  # pylint: disable=too-many-arguments
         self.mode = mode
         self.engine_config = engine_config
         self.enable_tracing = enable_tracing
+        self.enable_debug = enable_debug
         self.host = host
         self.port = port
         self._proc: Optional[subprocess.Popen] = None
@@ -96,6 +98,8 @@ def start(self) -> None:  # pylint: disable=too-many-branches,too-many-statements

         if self.enable_tracing:
             cmd += ["--enable-tracing"]
+        if self.enable_debug:
+            cmd += ["--enable-debug"]

         cmd += ["--host", self.host]
         cmd += ["--port", str(self.port)]
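
With the flag plumbed through, test servers can opt into the debug endpoints. A sketch mirroring the conftest change below, with placeholder model artifacts:

    from mlc_llm.serve import PopenServer

    # Placeholder model id and library path; point these at real compiled artifacts.
    server = PopenServer(
        model="HF://mlc-ai/some-model-MLC",
        model_lib="path/to/model-lib.so",
        enable_tracing=True,
        enable_debug=True,  # adds --enable-debug to the launched server command
    )
    with server:
        pass  # issue requests against the server while it is running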

python/mlc_llm/serve/sync_engine.py

Lines changed: 1 addition & 1 deletion

@@ -307,7 +307,7 @@ def create_request(
         """
         if not isinstance(inputs, list):
             inputs = [inputs]
-        self._ffi["create_request"](request_id, inputs, generation_config.asjson())
+        return self._ffi["create_request"](request_id, inputs, generation_config.asjson())

     def add_request(self, request: Request) -> None:
         """Add a new request to the engine.

tests/python/serve/server/conftest.py

Lines changed: 2 additions & 1 deletion

@@ -4,7 +4,7 @@

 import pytest

-from mlc_llm.serve import EngineConfig, PopenServer
+from mlc_llm.serve import PopenServer


 @pytest.fixture(scope="session")
@@ -27,6 +27,7 @@ def launch_server(served_model):  # pylint: disable=redefined-outer-name
         model=served_model[0],
         model_lib=served_model[1],
         enable_tracing=True,
+        enable_debug=True,
     )

     with server:

tests/python/serve/server/test_server.py

Lines changed: 8 additions & 8 deletions

@@ -256,7 +256,7 @@ def test_openai_v1_completions(
         "prompt": prompt,
         "max_tokens": max_tokens,
         "stream": stream,
-        "ignore_eos": True,
+        "debug_config": {"ignore_eos": True},
     }

     response = requests.post(OPENAI_V1_COMPLETION_URL, json=payload, timeout=180)
@@ -347,7 +347,7 @@ def test_openai_v1_completions_echo(
         "max_tokens": max_tokens,
         "echo": True,
         "stream": stream,
-        "ignore_eos": True,
+        "debug_config": {"ignore_eos": True},
     }

     response = requests.post(OPENAI_V1_COMPLETION_URL, json=payload, timeout=180)
@@ -398,7 +398,7 @@ def test_openai_v1_completions_suffix(
         "max_tokens": max_tokens,
         "suffix": suffix,
         "stream": stream,
-        "ignore_eos": True,
+        "debug_config": {"ignore_eos": True},
     }

     response = requests.post(OPENAI_V1_COMPLETION_URL, json=payload, timeout=180)
@@ -498,7 +498,7 @@ def test_openai_v1_completions_temperature(
         "max_tokens": max_tokens,
         "stream": stream,
         "temperature": 0.0,
-        "ignore_eos": True,
+        "debug_config": {"ignore_eos": True},
     }

     response = requests.post(OPENAI_V1_COMPLETION_URL, json=payload, timeout=180)
@@ -652,7 +652,7 @@ def test_openai_v1_completions_logit_bias(
         "max_tokens": max_tokens,
         "stream": stream,
         "logit_bias": {338: -100},  # 338 is " is" in Llama tokenizer.
-        "ignore_eos": True,
+        "debug_config": {"ignore_eos": True},
     }

     response = requests.post(OPENAI_V1_COMPLETION_URL, json=payload, timeout=180)
@@ -699,7 +699,7 @@ def test_openai_v1_completions_presence_frequency_penalty(
         "stream": stream,
         "frequency_penalty": 2.0,
         "presence_penalty": 2.0,
-        "ignore_eos": True,
+        "debug_config": {"ignore_eos": True},
     }

     response = requests.post(OPENAI_V1_COMPLETION_URL, json=payload, timeout=180)
@@ -743,7 +743,7 @@ def test_openai_v1_completions_seed(
         "max_tokens": max_tokens,
         "stream": False,
         "seed": 233,
-        "ignore_eos": True,
+        "debug_config": {"ignore_eos": True},
     }

     response1 = requests.post(OPENAI_V1_COMPLETION_URL, json=payload, timeout=180)
@@ -1207,7 +1207,7 @@ def test_openai_v1_chat_completions_ignore_eos(
         "messages": messages,
         "stream": stream,
         "max_tokens": max_tokens,
-        "ignore_eos": True,
+        "debug_config": {"ignore_eos": True},
     }

     response = requests.post(OPENAI_V1_CHAT_COMPLETION_URL, json=payload, timeout=180)
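
The chat endpoint takes the same nested field; with eos ignored, generation is expected to run until max_tokens. A small sketch against the chat completions URL, with placeholder URL and model id:

    import requests

    payload = {
        "model": "local-model",  # placeholder model id
        "messages": [{"role": "user", "content": "Write a short poem."}],
        "stream": False,
        "max_tokens": 64,
        "debug_config": {"ignore_eos": True},
    }
    response = requests.post(
        "http://127.0.0.1:8000/v1/chat/completions", json=payload, timeout=180
    )
    # With eos ignored, completion_tokens is expected to reach max_tokens.
    print(response.json()["usage"])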

tests/python/serve/test_serve_async_engine.py

Lines changed: 2 additions & 2 deletions

@@ -195,9 +195,9 @@ async def generate_task(prompt: str, request_id: str):
         model=model,
         max_tokens=max_tokens,
         n=n,
-        ignore_eos=True,
         request_id=request_id,
         stream=True,
+        debug_config={"ignore_eos": True},
     ):
         for choice in response.choices:
             output_texts[rid][choice.index] += choice.text
@@ -245,8 +245,8 @@ async def generate_task(prompt: str, request_id: str):
         model=model,
         max_tokens=max_tokens,
         n=n,
-        ignore_eos=True,
         request_id=request_id,
+        debug_config={"ignore_eos": True},
     )
     for choice in response.choices:
         output_texts[rid][choice.index] += choice.text

tests/python/serve/test_serve_engine.py

Lines changed: 2 additions & 2 deletions

@@ -175,9 +175,9 @@ def test_completion(model: str, model_lib: str):
         model=model,
         max_tokens=max_tokens,
         n=n,
-        ignore_eos=True,
         request_id=str(rid),
         stream=True,
+        debug_config={"ignore_eos": True},
     ):
         for choice in response.choices:
             output_texts[rid][choice.index] += choice.text
@@ -212,8 +212,8 @@ def test_completion_non_stream(model: str, model_lib: str):
         model=model,
         max_tokens=max_tokens,
         n=n,
-        ignore_eos=True,
         request_id=str(rid),
+        debug_config={"ignore_eos": True},
     )
     for choice in response.choices:
         output_texts[rid][choice.index] += choice.text
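
The in-process engine tests pass the same keyword. A rough sketch using the OpenAI-style engine API, assuming debug_config is accepted there the same way as in these tests; the model reference is a placeholder:

    from mlc_llm import MLCEngine

    engine = MLCEngine("HF://mlc-ai/some-model-MLC")  # placeholder model reference
    response = engine.chat.completions.create(
        messages=[{"role": "user", "content": "Write a short poem."}],
        max_tokens=64,
        stream=False,
        debug_config={"ignore_eos": True},  # assumed keyword, mirroring the tests above
    )
    print(response.choices[0].message.content)
    engine.terminate()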
