Skip to content

Commit 4f980f3

Browse files
committed
[feat] Add Structural-Tag api to RequestResponseFormat
- Expose Structural-Tag api, which can be used to standardize the function calling format - Add test script for Structural-Tag (passed on Llama-2-7b-chat-hf-q0f16-MLC and Llama-3-8B-Instruct-q4f16_1-MLC)
1 parent 6e7426e commit 4f980f3

File tree

9 files changed

+451
-68
lines changed

9 files changed

+451
-68
lines changed

cpp/serve/config.cc

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,11 @@
1010

1111
#include <limits>
1212
#include <random>
13-
#include <string>
14-
#include <vector>
1513

1614
#include "../json_ffi/openai_api_protocol.h"
1715
#include "../support/json_parser.h"
1816
#include "../support/utils.h"
1917
#include "data.h"
20-
#include "tvm/runtime/container/array.h"
2118

2219
namespace mlc {
2320
namespace llm {
@@ -1124,4 +1121,4 @@ Result<bool> ModelsUseKVCache(const std::vector<picojson::object>& model_configs
11241121

11251122
} // namespace serve
11261123
} // namespace llm
1127-
} // namespace mlc
1124+
} // namespace mlc

cpp/serve/config.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
#include "../metadata/model.h"
1616
#include "../support/result.h"
17-
#include "tvm/runtime/container/optional.h"
1817

1918
namespace mlc {
2019
namespace llm {
@@ -451,4 +450,4 @@ inline PrefillMode PrefillModeFromString(const std::string& prefill_mode) {
451450
} // namespace llm
452451
} // namespace mlc
453452

454-
#endif // MLC_LLM_SERVE_CONFIG_H_
453+
#endif // MLC_LLM_SERVE_CONFIG_H_

cpp/serve/engine.cc

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,8 @@
1919
#include <functional>
2020
#include <numeric>
2121
#include <optional>
22-
#include <string>
2322
#include <tuple>
2423
#include <unordered_set>
25-
#include <utility>
2624

2725
#include "../support/json_parser.h"
2826
#include "../support/result.h"
@@ -37,7 +35,6 @@
3735
#include "request.h"
3836
#include "request_state.h"
3937
#include "sampler/sampler.h"
40-
#include "xgrammar/grammar.h"
4138

4239
namespace mlc {
4340
namespace llm {

cpp/serve/logit_processor.cc

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
#include <tvm/runtime/registry.h>
1313
#include <tvm/runtime/threading_backend.h>
1414

15-
#include <cstdio>
16-
1715
namespace mlc {
1816
namespace llm {
1917
namespace serve {

python/mlc_llm/protocol/openai_api_protocol.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ def check_request_response_format(self) -> "RequestResponseFormat":
112112
raise Warning(
113113
"'tags' and 'triggers' attributes should be used when type='structural_tag'"
114114
)
115+
115116
return self
116117

117118

python/mlc_llm/serve/engine.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -976,12 +976,6 @@ async def _chat_completion( # pylint: disable=too-many-arguments,too-many-local
976976
if request_id is None:
977977
request_id = f"chatcmpl-{engine_utils.random_uuid()}"
978978

979-
tools = (
980-
[openai_api_protocol.ChatTool.model_validate(tool) for tool in tools]
981-
if tools is not None
982-
else None
983-
)
984-
985979
chatcmpl_generator = self._handle_chat_completion(
986980
openai_api_protocol.ChatCompletionRequest(
987981
messages=[
@@ -1213,10 +1207,6 @@ async def _handle_chat_completion(
12131207
e : BadRequestError
12141208
BadRequestError is raised when the request is invalid.
12151209
"""
1216-
request.response_format = engine_base.set_structural_tag_from_tools(
1217-
request.tools, request.response_format
1218-
)
1219-
12201210
(
12211211
prompts,
12221212
generation_cfg,
@@ -1774,10 +1764,6 @@ def _handle_chat_completion(
17741764
e : BadRequestError
17751765
BadRequestError is raised when the request is invalid.
17761766
"""
1777-
request.response_format = engine_base.set_structural_tag_from_tools(
1778-
request.tools, request.response_format
1779-
)
1780-
17811767
(
17821768
prompts,
17831769
generation_cfg,

python/mlc_llm/serve/engine_base.py

Lines changed: 18 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import json
88
import numbers
99
import queue
10-
import re
1110
import sys
1211
import threading
1312
from dataclasses import dataclass
@@ -1147,52 +1146,29 @@ def create_completion_suffix_response(
11471146
return response
11481147

11491148

1150-
def convert_function_str_to_json(stringified_calls: str):
1149+
def convert_function_str_to_json(stringified_calls: str) -> List[Union[Dict, None]]:
11511150
"""Convert a (possibly list) of function call string to a list of json objects.
11521151
Return None for invalid function call string."""
1153-
function_calls_json = []
1154-
for call in re.finditer(r"<function=(.*?)>(.*?)</function>", stringified_calls, re.DOTALL):
1155-
function_name = call.group(1)
1156-
params_str = call.group(2).strip()
1157-
params = ast.literal_eval(params_str)
1158-
function_calls_json.append({"name": function_name, "arguments": params})
1159-
1160-
return function_calls_json
11611152

1153+
def parse_function_call(call_str: str):
1154+
node = ast.parse(call_str, mode="eval")
1155+
call_node = node.body
1156+
if isinstance(call_node, ast.Call) and isinstance(call_node.func, ast.Name):
1157+
name = call_node.func.id
1158+
arguments = {}
1159+
for keyword in call_node.keywords:
1160+
arguments[keyword.arg] = ast.literal_eval(keyword.value)
1161+
return {"name": name, "arguments": arguments}
1162+
return None
11621163

1163-
def set_structural_tag_from_tools(
1164-
tools: Optional[List[openai_api_protocol.ChatTool]],
1165-
response_format: Optional[openai_api_protocol.RequestResponseFormat],
1166-
):
1167-
"""Add the corresponding structural tag to the response format according to the tools to ensure valid function calling.
1168-
Return the updated response format.
1169-
"""
1170-
if tools is None:
1171-
return response_format
1164+
if (
1165+
stringified_calls[0] == "[" and stringified_calls[-1] == "]"
1166+
): # hacky way to check if string list
1167+
calls = ast.literal_eval(stringified_calls)
11721168
else:
1173-
if response_format is None or response_format.type == "text":
1174-
response_format = openai_api_protocol.RequestResponseFormat.model_validate(
1175-
{"type": "structural_tag", "tags": [], "triggers": []}
1176-
)
1177-
elif response_format.type == "json_object":
1178-
response_format.tags = []
1179-
response_format.triggers = []
1180-
1181-
response_format.triggers.append("<function=")
1182-
for tool in tools:
1183-
schema = {
1184-
"properties": tool.function.parameters["properties"],
1185-
"required": tool.function.parameters["required"],
1186-
"type": tool.function.parameters["type"],
1187-
}
1188-
response_format.tags.append(
1189-
{
1190-
"begin": f"<function={tool.function.name}>",
1191-
"schema": json.dumps(schema),
1192-
"end": "</function>",
1193-
}
1194-
)
1195-
return response_format
1169+
calls = [stringified_calls]
1170+
function_calls_json = [parse_function_call(call_str) for call_str in calls]
1171+
return function_calls_json
11961172

11971173

11981174
def process_function_call_output(

0 commit comments

Comments
 (0)