Commit 42f146d

[Serving][Grammar] Jump-forward decoding (#2551)
This PR supports jump-forward decoding as described in <https://lmsys.org/blog/2024-02-05-compressed-fsm/>. Jump-forward decoding uses the grammar constraint to predict the next output string and tokenizes that string into tokens, which speeds up decoding.

This PR implements the following optimizations to preserve output quality:

- Retokenization in jump-forward: tokenize the last k tokens, as a string, with the predicted string appended. If the tokenization result differs from the old tokens, roll back those tokens and accept the new ones.
- Retokenization in decoding: tokenize the last k tokens, as a string, with the decoded token appended. This happens in the decode stage when jump-forward decoding occurred in the previous round. If the result differs, the old tokens are rolled back.
- Skip prefix tokens in jump-forward: we call a token that is a prefix of another token a prefix token. If the last token produced by jump-forward is a prefix token, it is likely to be rolled back in the next decode stage, because it may be merged with the decoded token. It also affects the output distribution, since such a pattern is rare in training data. Therefore, we skip the last prefix token in jump-forward decoding.

This PR also includes the following changes:

- Add several metrics for requests and the engine, especially about jump-forward decoding.
- Fix a bug in `_async_query_engine_metrics` to avoid throwing CancelledError from an early return.

Performance and benchmark:

Schema (Pydantic):

```
class Product(BaseModel):
    product_id: int
    is_available: bool
    price: float
    is_featured: Literal[True]
    category: Literal["Electronics", "Clothing", "Food"]
    tags: List[str]
    stock: Dict[str, int]
```

Platform: AMD Ryzen 9 5900X, NVIDIA 3080 10G

Results:

```
Jump forward: False, Batch: 1
Engine metrics: {
  "engine_decode_time_sum": 0.4988938220000001,
  "engine_jump_forward_time_sum": 0,
  "completion_tokens_sum": 66,
  "decode_tokens_sum": 66,
  "jump_forward_tokens_sum": 0,
  "decode_tokens_per_s": 132.2926785010378,
}

Jump forward: True, Batch: 1
Engine metrics: {
  "engine_decode_time_sum": 0.37242740600000007,
  "engine_jump_forward_time_sum": 0.027989265000000006,
  "completion_tokens_sum": 68,
  "decode_tokens_sum": 68,
  "jump_forward_tokens_sum": 28,
  "decode_tokens_per_s": 182.58591850246378,
}

Jump forward: False, Batch: 4
Engine metrics: {
  "engine_decode_time_sum": 0.9106805410000002,
  "engine_jump_forward_time_sum": 0,
  "completion_tokens_sum": 261,
  "decode_tokens_sum": 261,
  "jump_forward_tokens_sum": 0,
  "decode_tokens_per_s": 286.5988546470984,
}

Jump forward: True, Batch: 4
Engine metrics: {
  "engine_decode_time_sum": 0.6843025599999999,
  "engine_jump_forward_time_sum": 0.028089531999999997,
  "completion_tokens_sum": 266,
  "decode_tokens_sum": 266,
  "jump_forward_tokens_sum": 112,
  "decode_tokens_per_s": 388.71694415405966,
}

Jump forward: False, Batch: 8
Engine metrics: {
  "engine_decode_time_sum": 1.62462493,
  "engine_jump_forward_time_sum": 0,
  "completion_tokens_sum": 538,
  "decode_tokens_sum": 538,
  "jump_forward_tokens_sum": 0,
  "decode_tokens_per_s": 331.1533573475325,
}

Jump forward: True, Batch: 8
Engine metrics: {
  "engine_decode_time_sum": 1.0509048310000002,
  "engine_jump_forward_time_sum": 0.027971332000000022,
  "completion_tokens_sum": 525,
  "decode_tokens_sum": 525,
  "jump_forward_tokens_sum": 224,
  "decode_tokens_per_s": 499.5694990767436,
}

Jump forward: False, Batch: 16
Engine metrics: {
  "engine_decode_time_sum": 2.317279175,
  "engine_jump_forward_time_sum": 0,
  "completion_tokens_sum": 1068,
  "decode_tokens_sum": 1068,
  "jump_forward_tokens_sum": 0,
  "decode_tokens_per_s": 460.8853398080531,
}

Jump forward: True, Batch: 16
Engine metrics: {
  "engine_decode_time_sum": 1.3962938819999997,
  "engine_jump_forward_time_sum": 0.030129287999999994,
  "completion_tokens_sum": 1059,
  "decode_tokens_sum": 1059,
  "jump_forward_tokens_sum": 448,
  "decode_tokens_per_s": 758.4363246533227,
}
```
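To make the retokenization rule above concrete, here is a minimal Python sketch of the check it describes. The tokenizer interface (HuggingFace-style `encode`/`decode`), the value of k, and all names are illustrative assumptions, not the engine's actual code.

```python
# Sketch of the "retokenization in jump-forward" rule described above.
# `tokenizer` is assumed to be a HuggingFace-style tokenizer; names and k are hypothetical.
def retokenize_for_jump_forward(tokenizer, token_ids, jump_forward_str, k=5):
    """Return (num_tokens_to_roll_back, replacement_token_ids) after appending
    the jump-forward string predicted by the grammar matcher."""
    last_k = list(token_ids[-k:])
    prefix_str = tokenizer.decode(last_k)
    # Re-tokenize the last k tokens together with the predicted string.
    new_ids = tokenizer.encode(prefix_str + jump_forward_str, add_special_tokens=False)
    if new_ids[:len(last_k)] == last_k:
        # The old tokens survive retokenization: keep them, append only the tail.
        return 0, new_ids[len(last_k):]
    # Tokenization changed: roll back the last k tokens and accept the new ones.
    return len(last_k), new_ids
```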
1 parent 4234262 commit 42f146d

32 files changed (+1310 / -312 lines)

cpp/grammar/grammar_state_matcher.cc

Lines changed: 109 additions & 5 deletions
@@ -8,6 +8,7 @@
 #include <chrono>
 #include <queue>

+#include "../support/dynamic_bitset.h"
 #include "../tokenizers/tokenizers.h"
 #include "grammar.h"
 #include "grammar_serializer.h"
@@ -134,10 +135,12 @@ class GrammarStateMatcherNodeImpl : public GrammarStateMatcherNode, public Gramm
         max_rollback_steps_(max_rollback_steps),
         tmp_accepted_bitset_(init_ctx_->vocab_size) {}

-  bool AcceptToken(int32_t token_id) final;
+  bool AcceptToken(int32_t token_id, bool verbose = false) final;

   void FindNextTokenBitmask(DLTensor* next_token_bitmask) final;

+  std::string FindJumpForwardString() final;
+
   void Rollback(int num_tokens) final;

   int MaxRollbackSteps() const final { return max_rollback_steps_; }
@@ -193,7 +196,7 @@ bool GrammarStateMatcherNodeImpl::AcceptStopToken() {
   return true;
 }

-bool GrammarStateMatcherNodeImpl::AcceptToken(int32_t token_id) {
+bool GrammarStateMatcherNodeImpl::AcceptToken(int32_t token_id, bool verbose) {
   CHECK(!IsTerminated())
       << "GrammarStateMatcher has terminated after accepting the stop token, but is trying to "
          "accept another token id "
@@ -202,10 +205,20 @@ bool GrammarStateMatcherNodeImpl::AcceptToken(int32_t token_id) {
   CHECK(token_id >= 0 && token_id < init_ctx_->vocab_size)
       << "Invalid token id " << token_id << " for GrammarStateMatcher";

+  if (verbose) {
+    LOG(INFO) << "Accepting token id " << token_id << ", string: \""
+              << PrintAsEscaped(init_ctx_->token_table[token_id]) << "\", state state:\n"
+              << PrintStackState();
+  }
+
   // Handle the stop token
   if (std::find(init_ctx_->stop_token_ids.begin(), init_ctx_->stop_token_ids.end(), token_id) !=
       init_ctx_->stop_token_ids.end()) {
-    return AcceptStopToken();
+    bool accepted = AcceptStopToken();
+    if (verbose) {
+      LOG(INFO) << "The token is an end token. Is accepted: " << accepted;
+    }
+    return accepted;
   }

   if (init_ctx_->special_token_ids.count(token_id) > 0) {
@@ -215,16 +228,25 @@ bool GrammarStateMatcherNodeImpl::AcceptToken(int32_t token_id) {
   }

   const auto& token = init_ctx_->token_table[token_id];
+  int pos = 0;
   for (auto char_value : token) {
     if (!AcceptChar(char_value, false)) {
+      if (verbose) {
+        LOG(INFO) << "The token is rejected at position " << pos << ", character "
+                  << PrintAsEscaped(char_value);
+      }
       return false;
     }
+    ++pos;
   }
   token_length_history.push_back(token.size());
   if (token_length_history.size() > max_rollback_steps_) {
     DiscardEarliestChars(token_length_history.front());
     token_length_history.pop_front();
   }
+  if (verbose) {
+    LOG(INFO) << "The token is accepted. State after accepting:\n" << PrintStackState();
+  }
   return true;
 }

@@ -342,6 +364,85 @@ void GrammarStateMatcherNodeImpl::FindNextTokenBitmask(DLTensor* next_token_bitm
   SetTokenBitmask(next_token_bitmask, tmp_accepted_bitset_, tmp_rejected_indices_, can_reach_end);
 }

+std::string GrammarStateMatcherNodeImpl::FindJumpForwardString() {
+  CHECK(!IsTerminated())
+      << "GrammarStateMatcher has terminated after accepting the stop token, but is trying to "
+         "get the jump forward string";
+
+  std::string result;
+  int num_accepted_chars = 0;
+  bool can_find_next_char = true;
+
+  while (can_find_next_char) {
+    const auto& stack_tops = stack_tops_history_.GetLatest();
+
+    // 1. Check that for every stack top, the next possible char is unique and the same
+    // -1 means not found yet; 0~255 means the next char
+    int next_char = -1;
+    for (auto stack_top : stack_tops) {
+      auto rule_position = tree_[stack_top];
+      auto cur_sequence = grammar_->GetRuleExpr(rule_position.sequence_id);
+      if (rule_position.parent_id == RulePosition::kNoParent &&
+          rule_position.element_id == cur_sequence.size()) {
+        can_find_next_char = false;
+        break;
+      }
+
+      auto cur_element = grammar_->GetRuleExpr(cur_sequence[rule_position.element_id]);
+
+      if (cur_element.type == RuleExprType::kByteString) {
+        DCHECK(rule_position.element_in_string < cur_element.size());
+        if (next_char == -1) {
+          next_char = cur_element[rule_position.element_in_string];
+        } else if (next_char != cur_element[rule_position.element_in_string]) {
+          can_find_next_char = false;
+          break;
+        }
+      } else {
+        DCHECK(cur_element.type == RuleExprType::kCharacterClass ||
+               cur_element.type == RuleExprType::kCharacterClassStar);
+        if (rule_position.left_utf8_bytes > 0 || cur_element.size() != 3 || cur_element[0] != 0 ||
+            cur_element[1] != cur_element[2]) {
+          can_find_next_char = false;
+          break;
+        } else if (next_char == -1) {
+          next_char = cur_element[1];
+        } else if (next_char != cur_element[1]) {
+          can_find_next_char = false;
+          break;
+        }
+      }
+    }
+
+    if (next_char == -1) {
+      can_find_next_char = false;
+    }
+
+    // 2. If found, accept the char and iterate to the next position
+    if (can_find_next_char) {
+      result += static_cast<uint8_t>(next_char);
+
+      tmp_new_stack_tops_.clear();
+      for (auto stack_top : stack_tops) {
+        auto cur_rule_position = tree_[stack_top];
+        auto new_rule_position = UpdatePositionWithChar(cur_rule_position, next_char);
+
+        if (new_rule_position == cur_rule_position) {
+          ExpandRulePosition(new_rule_position, &tmp_new_stack_tops_, true, stack_top);
+        } else {
+          ExpandRulePosition(new_rule_position, &tmp_new_stack_tops_, true);
+        }
+      }
+      stack_tops_history_.PushHistory(tmp_new_stack_tops_);
+      ++num_accepted_chars;
+    }
+  }
+
+  // Rollback all chars accepted
+  RollbackChars(num_accepted_chars);
+  return result;
+}
+
 void GrammarStateMatcherNodeImpl::Rollback(int num_tokens) {
   CHECK(num_tokens <= token_length_history.size())
       << "Intended to rollback " << num_tokens << " tokens, but only the last "
@@ -477,10 +578,13 @@ TVM_REGISTER_GLOBAL("mlc.grammar.GrammarStateMatcherDebugAcceptChar")
     });

 TVM_REGISTER_GLOBAL("mlc.grammar.GrammarStateMatcherAcceptToken")
-    .set_body_typed([](GrammarStateMatcher matcher, int32_t token_id) {
-      return matcher->AcceptToken(token_id);
+    .set_body_typed([](GrammarStateMatcher matcher, int32_t token_id, bool verbose) {
+      return matcher->AcceptToken(token_id, verbose);
     });

+TVM_REGISTER_GLOBAL("mlc.grammar.GrammarStateMatcherFindJumpForwardString")
+    .set_body_typed([](GrammarStateMatcher matcher) { return matcher->FindJumpForwardString(); });
+
 TVM_REGISTER_GLOBAL("mlc.grammar.GrammarStateMatcherRollback")
     .set_body_typed([](GrammarStateMatcher matcher, int num_tokens) {
      matcher->Rollback(num_tokens);
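The `TVM_REGISTER_GLOBAL` entries above expose the matcher through TVM packed functions. A rough Python sketch of how they could be fetched and called, assuming the MLC/TVM runtime libraries are loaded and a `matcher` object has been constructed through the matcher's own factory path (not shown in this diff); illustrative only, not a complete working script:

```python
import tvm

# Packed functions registered in this file; fetching them requires the MLC runtime
# to be loaded, and `matcher` must come from construction code outside this diff.
accept_token = tvm.get_global_func("mlc.grammar.GrammarStateMatcherAcceptToken")
find_jump_forward = tvm.get_global_func("mlc.grammar.GrammarStateMatcherFindJumpForwardString")
rollback = tvm.get_global_func("mlc.grammar.GrammarStateMatcherRollback")

def debug_step(matcher, token_id):
    # Accept one token with verbose logging, then ask how far the output is already
    # determined by the grammar. Undo with rollback(matcher, n) if needed.
    assert accept_token(matcher, token_id, True)
    return find_jump_forward(matcher)
```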

cpp/grammar/grammar_state_matcher.h

Lines changed: 8 additions & 1 deletion
@@ -65,7 +65,7 @@ class GrammarStateMatcherNode : public Object {
   * FindNextTokenMask operations can be performed. The termination state can be canceled
   * using Rollback().
   */
-  virtual bool AcceptToken(int32_t token_id) = 0;
+  virtual bool AcceptToken(int32_t token_id, bool verbose = false) = 0;

  /*!
   * \brief Find the set of tokens that are acceptable for the next step and store them in a
@@ -75,6 +75,13 @@
   */
  virtual void FindNextTokenBitmask(DLTensor* next_token_bitmask) = 0;

+  /*!
+   * \brief Find the jump-forward string for jump-forward decoding. This is the longest string that
+   will be valid according to the current syntax.
+   * \note This method does not change the grammar state.
+   */
+  virtual std::string FindJumpForwardString() = 0;
+
  /*!
   * \brief Rollback the matcher to a previous state.
   * \param num_tokens The number of tokens to rollback. It cannot exceed the current number of
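As the new doc comment states, the jump-forward string is the longest continuation that the grammar fully determines from the current state. A toy Python sketch of the same idea, phrased over an explicit set of allowed continuations rather than the matcher's stack tops (purely illustrative, not the engine's API):

```python
def jump_forward_string(allowed_continuations):
    """Longest string that every allowed continuation starts with, i.e. the part of
    the output that the grammar fully determines. Mirrors the unique-next-char loop
    of FindJumpForwardString, but over explicit strings instead of grammar stacks."""
    result = []
    pos = 0
    while True:
        next_chars = {s[pos] for s in allowed_continuations if pos < len(s)}
        if len(next_chars) != 1 or any(pos >= len(s) for s in allowed_continuations):
            return "".join(result)
        result.append(next_chars.pop())
        pos += 1

# For a JSON object whose next key must be "product_id":
print(jump_forward_string(['"product_id": 1}', '"product_id": 23}']))  # -> '"product_id": '
```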

cpp/grammar/grammar_state_matcher_preproc.h

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@

 #include <vector>

+#include "../support/dynamic_bitset.h"
 #include "../support/encoding.h"
 #include "../support/utils.h"
 #include "grammar.h"

cpp/grammar/support.h

Lines changed: 0 additions & 90 deletions
@@ -17,96 +17,6 @@ namespace mlc {
 namespace llm {
 namespace serve {

-/*! \brief A bitset with runtime specified length. It manages memory internally or the memory
- * provided externally with enough size. */
-class DynamicBitset {
- public:
-  static int CalculateBufferSize(int element_size) { return (element_size + 31) / 32; }
-
-  DynamicBitset() : size_(0), buffer_size_(0), data_(nullptr), is_internal_(true) {}
-
-  DynamicBitset(int size, uint32_t* data = nullptr)
-      : size_(size), buffer_size_(CalculateBufferSize(size)) {
-    if (data == nullptr) {
-      internal_buffer_.resize(buffer_size_, 0);
-      data_ = internal_buffer_.data();
-      is_internal_ = true;
-    } else {
-      data_ = data;
-      is_internal_ = false;
-    }
-  }
-
-  DynamicBitset& operator=(const DynamicBitset& other) {
-    DCHECK(is_internal_ || size_ >= other.size_) << "Expanding bitset size is not allowed when the "
-                                                    "memory of the bitset is externally managed";
-    size_ = other.size_;
-    buffer_size_ = other.buffer_size_;
-    if (is_internal_) {
-      internal_buffer_.reserve(buffer_size_);
-      data_ = internal_buffer_.data();
-    }
-    if (data_ != other.data_) {
-      std::memcpy(data_, other.data_, buffer_size_ * sizeof(uint32_t));
-    }
-    return *this;
-  }
-
-  DynamicBitset& operator=(DynamicBitset&& other) {
-    size_ = other.size_;
-    buffer_size_ = other.buffer_size_;
-    is_internal_ = other.is_internal_;
-    if (is_internal_) {
-      internal_buffer_ = std::move(other.internal_buffer_);
-      data_ = internal_buffer_.data();
-    } else {
-      data_ = other.data_;
-    }
-    return *this;
-  }
-
-  bool operator[](int index) const {
-    DCHECK(data_ && index >= 0 && index < size_);
-    return (data_[index / 32] >> (index % 32)) & 1;
-  }
-
-  int Size() const { return size_; }
-
-  void Set(int index, bool value) {
-    DCHECK(data_ && index >= 0 && index < size_);
-    if (value) {
-      data_[index / 32] |= 1 << (index % 32);
-    } else {
-      data_[index / 32] &= ~(1 << (index % 32));
-    }
-  }
-
-  void Set() {
-    DCHECK(data_);
-    std::memset(data_, 0xFF, buffer_size_ * sizeof(uint32_t));
-  }
-
-  void Reset() {
-    DCHECK(data_);
-    std::memset(data_, 0, buffer_size_ * sizeof(uint32_t));
-  }
-
-  DynamicBitset& operator|=(const DynamicBitset& other) {
-    DCHECK(buffer_size_ <= other.buffer_size_);
-    for (int i = 0; i < buffer_size_; ++i) {
-      data_[i] |= other.data_[i];
-    }
-    return *this;
-  }
-
- private:
-  int size_;
-  int buffer_size_;
-  uint32_t* data_;
-  std::vector<uint32_t> internal_buffer_;
-  bool is_internal_;
-};
-
 /*!
  * \brief Let lhs be the union of lhs and rhs. Suppose that both sets are sorted.
  * \note No additional vectors are allocated, and the time complexity is O(n)
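For reference, the `DynamicBitset` removed here (and now included from `../support/dynamic_bitset.h`) packs bits into 32-bit words. A small Python mirror of its index arithmetic, purely as a worked example:

```python
# Python mirror of DynamicBitset's word/bit arithmetic (illustrative only).
def bitset_set(words, index, value):
    if value:
        words[index // 32] |= 1 << (index % 32)
    else:
        words[index // 32] &= ~(1 << (index % 32))

def bitset_get(words, index):
    return (words[index // 32] >> (index % 32)) & 1

words = [0] * ((100 + 31) // 32)  # CalculateBufferSize(100) == 4 words for 100 bits
bitset_set(words, 70, True)       # touches word 70 // 32 == 2, bit 70 % 32 == 6
assert bitset_get(words, 70) == 1 and bitset_get(words, 71) == 0
```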

cpp/serve/config.cc

Lines changed: 19 additions & 0 deletions
@@ -77,6 +77,15 @@ Result<DebugConfig> DebugConfig::FromJSON(const picojson::object& config) {
       return TResult::Error("Uknown special request " + special_request);
     }
   }
+  std::string grammar_execution_mode =
+      json::LookupOrDefault<std::string>(config, "grammar_execution_mode", "jump_forward");
+  if (grammar_execution_mode == "jump_forward") {
+    res.grammar_execution_mode = GrammarExecutionMode::kJumpForward;
+  } else if (grammar_execution_mode == "constraint") {
+    res.grammar_execution_mode = GrammarExecutionMode::kConstraint;
+  } else {
+    return TResult::Error("Uknown grammar execution mode " + grammar_execution_mode);
+  }
   return TResult::Ok(res);
 }

@@ -95,6 +104,16 @@ picojson::object DebugConfig::AsJSON() const {
     case SpecialRequestKind::kNone:
       break;
   }
+  switch (grammar_execution_mode) {
+    case GrammarExecutionMode::kJumpForward: {
+      config["grammar_execution_mode"] = picojson::value("jump_forward");
+      break;
+    }
+    case GrammarExecutionMode::kConstraint: {
+      config["grammar_execution_mode"] = picojson::value("constraint");
+      break;
+    }
+  }
   return config;
 }

cpp/serve/config.h

Lines changed: 11 additions & 0 deletions
@@ -46,12 +46,23 @@ enum class SpecialRequestKind : int {
   kQueryEngineMetrics = 1,
 };

+/*! \brief Controls the behavior of inference with grammar constraint. */
+enum class GrammarExecutionMode : int {
+  /*! \brief If grammar is provided for a request, use the grammar to constrain the output token. */
+  kConstraint = 0,
+  /*! \brief If grammar is provided for a request, not only constrain the output, but also use the
+   * jump-forward decoding to predict the next tokens. This is the default option. */
+  kJumpForward = 1,
+};
+
 /*! \brief The debug configuration of a request. */
 class DebugConfig {
  public:
   bool ignore_eos = false;
   bool pinned_system_prompt = false;
   SpecialRequestKind special_request = SpecialRequestKind::kNone;
+  /*! \brief The grammar execution mode. */
+  GrammarExecutionMode grammar_execution_mode = GrammarExecutionMode::kJumpForward;

  /*!
   * \brief Create debug config from JSON.
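Putting the two config changes together: `DebugConfig::FromJSON` accepts a `grammar_execution_mode` key with values `"jump_forward"` (the default) or `"constraint"`. A hypothetical payload sketch, written as a Python dict to be JSON-serialized; the other keys are assumed to mirror the `DebugConfig` field names above, and how the debug config is attached to a request is outside this diff:

```python
import json

# Hypothetical debug-config payload; "grammar_execution_mode" matches FromJSON above,
# the other keys are assumed to mirror DebugConfig's field names.
debug_config = {
    "ignore_eos": False,
    "pinned_system_prompt": False,
    # "constraint": only mask invalid tokens; "jump_forward" (default): also predict
    # deterministic continuations with jump-forward decoding.
    "grammar_execution_mode": "constraint",
}
print(json.dumps(debug_config))
```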

cpp/serve/data.cc

Lines changed: 8 additions & 4 deletions
@@ -173,12 +173,13 @@ TVM_REGISTER_OBJECT_TYPE(RequestStreamOutputObj);
 RequestStreamOutput::RequestStreamOutput(
     String request_id, Array<IntTuple> group_delta_token_ids,
     Optional<Array<Array<String>>> group_delta_logprob_json_strs,
-    Array<Optional<String>> group_finish_reason) {
+    Array<Optional<String>> group_finish_reason, Array<String> group_extra_prefix_string) {
   ObjectPtr<RequestStreamOutputObj> n = make_object<RequestStreamOutputObj>();
   n->request_id = std::move(request_id);
   n->group_delta_token_ids = std::move(group_delta_token_ids);
   n->group_delta_logprob_json_strs = std::move(group_delta_logprob_json_strs);
   n->group_finish_reason = std::move(group_finish_reason);
+  n->group_extra_prefix_string = std::move(group_extra_prefix_string);
   data_ = std::move(n);
 }

@@ -192,9 +193,12 @@ RequestStreamOutput RequestStreamOutput::Usage(String request_id,

 TVM_REGISTER_GLOBAL("mlc.serve.RequestStreamOutputUnpack")
     .set_body_typed([](RequestStreamOutput output) {
-      return Array<ObjectRef>{output->request_id, output->group_delta_token_ids,
-                              output->group_delta_logprob_json_strs, output->group_finish_reason,
-                              output->request_final_usage_json_str};
+      return Array<ObjectRef>{output->request_id,
+                              output->group_delta_token_ids,
+                              output->group_delta_logprob_json_strs,
+                              output->group_finish_reason,
+                              output->request_final_usage_json_str,
+                              output->group_extra_prefix_string};
     });

 } // namespace serve
