Revert "[hf] Add new features to HF tokenizer" #90

Closed
wants to merge 1 commit
6 changes: 3 additions & 3 deletions .github/workflows/pull.yml
@@ -24,6 +24,6 @@ jobs:
timeout: 90
script: |
set -ex
cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
cmake --build build/test -j9 --config Debug
cd build/test && ctest
cmake -DTOKENIZERS_BUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug . -Bbuild
cmake --build build -j9 --config Debug
cd build && ctest
6 changes: 3 additions & 3 deletions .github/workflows/trunk.yml
@@ -31,6 +31,6 @@ jobs:
timeout: 90
script: |
set -ex
cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
cmake --build build/test -j9 --config Debug
cd build/test && ctest
cmake -DTOKENIZERS_BUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug . -Bbuild
cmake --build build -j9 --config Debug
cd build && ctest
42 changes: 2 additions & 40 deletions include/pytorch/tokenizers/pre_tokenizer.h
@@ -104,21 +104,6 @@ class PreTokenizerConfig {
*/
CONFIG_MEMBER(bool, add_prefix_space)

/**
* Used by RegexPreTokenizer
*/
CONFIG_MEMBER(bool, is_delimiter)

/**
* Used by RegexPreTokenizer - Split behavior
*/
CONFIG_MEMBER(std::string, behavior)

/**
* Used by RegexPreTokenizer - Split invert flag
*/
CONFIG_MEMBER(bool, invert)

/**
* Used by: SequencePreTokenizer
*/
@@ -156,29 +141,8 @@ class PreTokenizerConfig {

class RegexPreTokenizer : public PreTokenizer {
public:
/**
* @param pattern: The regex pattern to use for token splitting
* @param is_delimiter: Whether treat `pattern` as delimiter characters, or
* use `pattern` as a regex pattern.
* @param behavior: Split behavior (only "MergedWithPrevious" supported)
* For example:
* "pre_tokenizer": {
* "type": "Split",
* "pattern": {
* "String": " "
* },
* "behavior": "MergedWithPrevious",
* "invert": false
* },
* Notice that the `invert` option is not supported.
*/
explicit RegexPreTokenizer(
const std::string& pattern,
bool is_delimiter = false,
const std::string& behavior = "")
: regex_(RegexPreTokenizer::create_regex_(pattern)),
is_delimiter_(is_delimiter),
behavior_(behavior) {}
explicit RegexPreTokenizer(const std::string& pattern)
: regex_(RegexPreTokenizer::create_regex_(pattern)) {}

/** Pre-tokenize with the stored regex */
std::vector<std::string> pre_tokenize(const std::string& input) const;
@@ -187,8 +151,6 @@ class RegexPreTokenizer : public PreTokenizer {
static std::unique_ptr<IRegex> create_regex_(const std::string& pattern);

std::unique_ptr<IRegex> regex_;
const bool is_delimiter_;
const std::string behavior_;

}; // end class RegexPreTokenizer

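After this revert, RegexPreTokenizer takes only a pattern, always treats it as a regex, and pre_tokenize returns the matched substrings themselves. A minimal usage sketch against the reverted interface (the include path and the tokenizers namespace are assumed from the rest of this PR):

```cpp
// Sketch only: how a caller uses the post-revert interface.
#include <iostream>
#include <pytorch/tokenizers/pre_tokenizer.h>

int main() {
  // The pattern is now always treated as a regex; matches are the pre-tokens.
  tokenizers::RegexPreTokenizer pre_tok(R"([^\s]+|\s+)");
  for (const auto& piece : pre_tok.pre_tokenize("hello  world")) {
    std::cout << "'" << piece << "'\n";  // 'hello', '  ', 'world'
  }
  return 0;
}
```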
8 changes: 0 additions & 8 deletions include/pytorch/tokenizers/regex.h
@@ -42,14 +42,6 @@ class IRegex {
* @return A vector of strings containing all matched substrings.
*/
virtual std::vector<Match> find_all(const std::string& text) const = 0;

/**
* @brief Escape special regex characters in a string to treat it as literal.
*
* @param input The input string to escape.
* @return The escaped string that can be used as a literal pattern in regex.
*/
static std::string escape(const std::string& input);
};

// Function pointer type for create_fallback_regex implementations
52 changes: 0 additions & 52 deletions include/pytorch/tokenizers/token_decoder.h
@@ -65,13 +65,6 @@ class TokenDecoderConfig {
*/
std::string type;

// Parameters for Replace decoder
std::string replace_pattern;
std::string replace_content;

// Parameters for Sequence decoder
std::vector<nlohmann::json> sequence_decoders;

/*----------------*/
/* Public methods */
/*----------------*/
@@ -103,49 +96,4 @@ class ByteLevelTokenDecoder : public TokenDecoder {

}; // end class ByteLevelTokenDecoder

// -- Replace ------------------------------------------------------------------
// Replaces a pattern with a replacement string

class ReplaceTokenDecoder : public TokenDecoder {
public:
explicit ReplaceTokenDecoder(
const std::string& pattern,
const std::string& content);
std::string decode(const std::string& token) const override;

private:
std::string pattern_;
std::string content_;
}; // end class ReplaceTokenDecoder

// -- ByteFallback -------------------------------------------------------------
// Handles byte fallback decoding

class ByteFallbackTokenDecoder : public TokenDecoder {
public:
std::string decode(const std::string& token) const override;

}; // end class ByteFallbackTokenDecoder

// -- Fuse --------------------------------------------------------------------
// Fuses tokens together

class FuseTokenDecoder : public TokenDecoder {
public:
std::string decode(const std::string& token) const override;

}; // end class FuseTokenDecoder

// -- Sequence -----------------------------------------------------------------
// Applies a sequence of decoders in order

class SequenceTokenDecoder : public TokenDecoder {
public:
explicit SequenceTokenDecoder(std::vector<TokenDecoder::Ptr> decoders);
std::string decode(const std::string& token) const override;

private:
std::vector<TokenDecoder::Ptr> decoders_;
}; // end class SequenceTokenDecoder

} // namespace tokenizers
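This file drops the Replace, ByteFallback, Fuse, and Sequence decoders together with their config fields, leaving ByteLevel as the only concrete TokenDecoder. To make the lost composition concrete, here is a hypothetical, self-contained reconstruction of the Sequence decoder against the declarations deleted above; the chaining body is an assumption inferred from the per-token decode signature, not the original implementation:

```cpp
// Hypothetical reconstruction, for review context only. TokenDecoder here is
// a local stand-in for the project's base class so the sketch compiles alone.
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct TokenDecoder {
  using Ptr = std::shared_ptr<TokenDecoder>;
  virtual ~TokenDecoder() = default;
  virtual std::string decode(const std::string& token) const = 0;
};

class SequenceTokenDecoder : public TokenDecoder {
 public:
  explicit SequenceTokenDecoder(std::vector<TokenDecoder::Ptr> decoders)
      : decoders_(std::move(decoders)) {}

  std::string decode(const std::string& token) const override {
    std::string out = token;
    for (const auto& decoder : decoders_) {
      out = decoder->decode(out);  // each child sees the previous result
    }
    return out;
  }

 private:
  std::vector<TokenDecoder::Ptr> decoders_;
};
```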
1 change: 0 additions & 1 deletion src/bpe_tokenizer_base.cpp
@@ -194,7 +194,6 @@ Result<std::vector<uint64_t>> BPETokenizerBase::byte_pair_encode_(
return std::vector<uint64_t>(*result);
} else {
// TODO: is it possible?
TK_LOG(Error, "unknown token: '%s'", piece.c_str());
return Error::EncodeFailure;
}
}
100 changes: 3 additions & 97 deletions src/pre_tokenizer.cpp
@@ -37,24 +37,7 @@ PreTokenizer::Ptr PreTokenizerConfig::create() const {
throw std::runtime_error(
"Missing pattern for PreTokenizer of type Split");
}

// Validate behavior parameter
std::string behavior_str = behavior ? *behavior : "";
if (!behavior_str.empty() && behavior_str != "MergedWithPrevious") {
throw std::runtime_error(
"Unsupported behavior '" + behavior_str +
"' for Split PreTokenizer. Only 'MergedWithPrevious' is supported.");
}

// Validate invert parameter
bool invert_flag = invert ? *invert : false;
if (invert_flag) {
throw std::runtime_error(
"invert=true is not supported for Split PreTokenizer. Only invert=false is supported.");
}

return PreTokenizer::Ptr(new RegexPreTokenizer(
*pattern, is_delimiter ? *is_delimiter : false, behavior_str));
return PreTokenizer::Ptr(new RegexPreTokenizer(*pattern));
}
if (type == "Digits") {
if (individual_digits) {
@@ -96,27 +79,7 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
if (type == "Split") {
try {
pattern = json_config.at("pattern").at("Regex");
is_delimiter = false;
} catch (json::out_of_range&) {
// "Regex" is not there, check "String", which is a delimiter
std::string delimiter = json_config.at("pattern").at("String");
// For string patterns, escape regex special characters to treat them as
// literal strings (same as Rust's regex::escape)
pattern = IRegex::escape(delimiter);
is_delimiter = true;
}

// Parse behavior and invert fields
try {
behavior = json_config.at("behavior");
} catch (json::out_of_range&) {
// behavior is optional, default to empty string
}

try {
invert = json_config.at("invert");
} catch (json::out_of_range&) {
// invert is optional, default to false
}
} else if (type == "Digits") {
try {
@@ -152,66 +115,9 @@ std::vector<std::string> RegexPreTokenizer::pre_tokenize(
const std::string& input) const {
if (!regex_)
return {};

std::vector<std::string> results;
auto matches = regex_->find_all(input);

if (!is_delimiter_) {
// Original behavior: return the matches themselves
for (const auto& match : matches) {
results.push_back(input.substr(match.start, match.end - match.start));
}
} else {
// Delimiter behavior
if (matches.empty()) {
// No matches found, return the entire input
results.push_back(input);
return results;
}

if (behavior_ == "MergedWithPrevious") {
// MergedWithPrevious: Include delimiter with previous token
// Example: "the-final--countdown" with delimiter "-"
// -> ["the-", "final-", "-", "countdown"]
size_t last_end = 0;

for (size_t i = 0; i < matches.size(); ++i) {
const auto& match = matches[i];

// Add text before the match plus the delimiter
if (match.start > last_end) {
std::string token = input.substr(last_end, match.end - last_end);
results.push_back(token);
} else {
// Only delimiter, no preceding text
std::string delimiter =
input.substr(match.start, match.end - match.start);
results.push_back(delimiter);
}

last_end = match.end;
}

// Add remaining text after the last match (if any)
if (last_end < input.length()) {
results.push_back(input.substr(last_end));
}
} else {
// Default delimiter behavior (split on delimiters)
size_t last_end = 0;
for (const auto& match : matches) {
// Add text before the match (if any)
if (match.start > last_end) {
results.push_back(input.substr(last_end, match.start - last_end));
}
last_end = match.end;
}

// Add remaining text after the last match (if any)
if (last_end < input.length()) {
results.push_back(input.substr(last_end));
}
}
for (const auto& match : regex_->find_all(input)) {
results.push_back(input.substr(match.start, match.end - match.start));
}
return results;
}
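The surviving body of pre_tokenize simply returns each regex match. What the revert removes is the delimiter path: with a String pattern and behavior "MergedWithPrevious", each delimiter stayed attached to the preceding token, so "the-final--countdown" split on "-" became ["the-", "final-", "-", "countdown"]. For reference, a standalone sketch of that deleted merge step, reconstructed from the block removed above (Match is a stand-in for the IRegex match type with [start, end) byte offsets):

```cpp
// Reference-only reconstruction of the deleted MergedWithPrevious merge step.
#include <string>
#include <vector>

struct Match {
  size_t start;
  size_t end;
};

// "the-final--countdown" with matches at each '-' yields
// {"the-", "final-", "-", "countdown"}, per the deleted doc example.
std::vector<std::string> merged_with_previous(
    const std::string& input,
    const std::vector<Match>& matches) {
  std::vector<std::string> out;
  size_t last_end = 0;
  for (const auto& m : matches) {
    // Take everything since the previous delimiter, including this delimiter.
    out.push_back(input.substr(last_end, m.end - last_end));
    last_end = m.end;
  }
  if (last_end < input.size()) {
    out.push_back(input.substr(last_end));  // trailing text after last match
  }
  return out;
}
```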
17 changes: 0 additions & 17 deletions src/regex.cpp
@@ -33,23 +33,6 @@ FallbackRegexFn get_fallback_regex() {
return fallback_regex;
}

std::string IRegex::escape(const std::string& input) {
std::string result;
result.reserve(input.size() * 2); // Reserve space for potential escaping

for (char c : input) {
// Escape regex special characters to treat them as literal strings
if (c == '\\' || c == '^' || c == '$' || c == '.' || c == '|' || c == '?' ||
c == '*' || c == '+' || c == '(' || c == ')' || c == '[' || c == ']' ||
c == '{' || c == '}') {
result += '\\';
}
result += c;
}

return result;
}

Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
// Try RE2 first
auto re2 = std::make_unique<Re2Regex>();
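IRegex::escape existed so a literal String delimiter could be handed to the regex engine safely; with it gone, callers that still need literal splitting must escape patterns themselves. A minimal local helper mirroring the character set of the removed implementation (sketch only, not part of the tree after this revert):

```cpp
#include <string>

// Backslash-escape regex metacharacters so `input` can be used as a literal
// pattern; mirrors the escape set of the removed IRegex::escape.
std::string escape_literal(const std::string& input) {
  static const std::string specials = "\\^$.|?*+()[]{}";
  std::string out;
  out.reserve(input.size() * 2);  // room for worst-case escaping
  for (char c : input) {
    if (specials.find(c) != std::string::npos) {
      out += '\\';
    }
    out += c;
  }
  return out;
}
```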