Skip to content

Commit 20da1ca

Browse files
authored
Fix SmolLM3 support and add unit test to cover it (#102)
* Fix SmolLM3 support and add unit test to cover it
* Add test for Llama 3.2
* Lint
1 parent d5d8680 commit 20da1ca

File tree

4 files changed

+62
-7
lines changed

4 files changed

+62
-7
lines changed

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ jobs:
3030
3131
# Install tokenizers
3232
pip install . -v
33-
pip install pytest blobfile
33+
pip install pytest blobfile transformers>=4.53.1
3434
3535
# Run tests
3636
pytest

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737
3838
# Install tokenizers
3939
${CONDA_RUN} pip install . -v
40-
${CONDA_RUN} pip install pytest blobfile
40+
${CONDA_RUN} pip install pytest blobfile transformers>=4.53.1
4141
4242
# Run tests
4343
${CONDA_RUN} pytest

src/hf_tokenizer.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,19 @@ Error HFTokenizer::load(const std::string& path) {
107107
// Set up the normalizer (optional)
108108
try {
109109
TK_LOG(Info, "Setting up normalizer...");
110-
_normalizer =
111-
NormalizerConfig().parse_json(parsed_json.at("normalizer")).create();
112-
TK_LOG(Info, "Normalizer set up");
110+
const auto& normalizer_json = parsed_json.at("normalizer");
111+
if (!normalizer_json.is_null()) {
112+
_normalizer = NormalizerConfig().parse_json(normalizer_json).create();
113+
TK_LOG(Info, "Normalizer set up");
114+
} else {
115+
TK_LOG(Info, "Normalizer field is null, skipping");
116+
}
113117
} catch (const json::out_of_range& e) {
114-
// No normalizer specified, this is optional
115-
TK_LOG(Info, "No normalizer specified");
118+
// No "normalizer" field found (the JSON key is lowercase)
119+
TK_LOG(
120+
Info,
121+
"No 'Normalizer' field found in json, out of range error: %s",
122+
e.what());
116123
}
117124

118125
// Set up the pre-tokenizer

test/test_hf_tokenizer.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
# @lint-ignore-every LICENSELINT
7+
8+
"""
9+
Test script for hf tokenizers.
10+
"""
11+
12+
import unittest
13+
from pytorch_tokenizers import CppHFTokenizer
14+
from transformers import AutoTokenizer
15+
from tempfile import TemporaryDirectory
16+
17+
PROMPT = "What is the capital of France?"
18+
19+
class TestHfTokenizer(unittest.TestCase):
    """Verify that the C++ HF tokenizer (CppHFTokenizer) produces exactly the
    same token ids as the reference Hugging Face `transformers` tokenizer
    when both encode the same prompt.
    """

    def setUp(self) -> None:
        # One temporary directory per test: save_pretrained() writes the
        # tokenizer.json artifact there, which the C++ tokenizer then loads.
        self.temp_dir = TemporaryDirectory()
        # Remove the directory even if the test fails or errors out.
        self.addCleanup(self.temp_dir.cleanup)
        super().setUp()

    def _assert_cpp_matches_hf(self, model_id: str, **encode_kwargs) -> None:
        """Encode PROMPT with both tokenizers for `model_id` and compare ids.

        Args:
            model_id: Hugging Face repo id to pull the tokenizer from.
            **encode_kwargs: Extra keyword arguments forwarded verbatim to
                CppHFTokenizer.encode (e.g. bos=1 to prepend one BOS token).
        """
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        # save_pretrained() returns the list of files it wrote; the last one
        # is the tokenizer.json that the C++ tokenizer consumes.
        tokenizer_path = tokenizer.save_pretrained(self.temp_dir.name)[-1]

        cpp_tokenizer = CppHFTokenizer()
        cpp_tokenizer.load(tokenizer_path)

        tokens = tokenizer.encode(PROMPT)
        cpp_tokens = cpp_tokenizer.encode(PROMPT, **encode_kwargs)
        self.assertEqual(tokens, cpp_tokens)

    def test_smolLM3(self) -> None:
        # SmolLM3's tokenizer.json appears to carry a null "normalizer"
        # field; this covers the C++ fix that skips a null normalizer
        # instead of failing to parse it.
        self._assert_cpp_matches_hf("HuggingFaceTB/SmolLM3-3B")

    def test_llama3_2_1b(self) -> None:
        # The HF encode path for Llama 3.2 prepends a BOS token, so the C++
        # tokenizer is asked for one BOS token (bos=1) to match.
        self._assert_cpp_matches_hf("unsloth/Llama-3.2-1B-Instruct", bos=1)

0 commit comments

Comments (0)