# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import os
import shutil
import tempfile
import unittest

from scripts.download_tokenizer import download_hf_tokenizer_files

from tokenizers import Tokenizer

from torchtitan.components.tokenizer import build_hf_tokenizer


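# NOTE: this test downloads tokenizer files from the Hugging Face Hub
# (via download_hf_tokenizer_files and Tokenizer.from_pretrained), so it
# requires network access to run.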
class TestTokenizerIntegration(unittest.TestCase):
    """Test integration between download_hf_tokenizer_files and build_hf_tokenizer."""

    def setUp(self):
        """Create a temporary directory for test files."""
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Clean up the temporary directory."""
        shutil.rmtree(self.temp_dir)

    def test_download_and_load_tokenizer_integration(self):
        """
        Test downloading tokenizer files and loading them, comparing with official APIs.

        This test:
        1. Downloads tokenizer files using download_hf_tokenizer_files
        2. Loads the tokenizer using our build_hf_tokenizer function
        3. Compares behavior with the official tokenizers library
        4. Compares with transformers AutoTokenizer (if available)
        """
        # Use a publicly accessible model for testing
        test_repo_id = "deepseek-ai/DeepSeek-V3"

        # Step 1: Download tokenizer files
        download_hf_tokenizer_files(
            repo_id=test_repo_id,
            local_dir=self.temp_dir,
            hf_token=None,  # Public model, no token needed
        )

        # Step 2: Load tokenizer using our function
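        # download_hf_tokenizer_files is expected to place the files under
        # <local_dir>/<model_name>, so rebuild that path from the repo id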
        model_name = test_repo_id.split("/")[-1]
        tokenizer_path = os.path.join(self.temp_dir, model_name)
        our_tokenizer = build_hf_tokenizer(tokenizer_path)

        # Step 3: Load tokenizer using the official tokenizers library
        official_tokenizer = Tokenizer.from_pretrained(test_repo_id)

        # Step 4: Load tokenizer using transformers AutoTokenizer (if available)
        transformers_tokenizer = None
        try:
            from transformers import AutoTokenizer

            transformers_tokenizer = AutoTokenizer.from_pretrained(test_repo_id)
        except Exception:
            pass  # Skip transformers comparison if not available

        # Step 5: Compare underlying tokenizer attributes
        # Test that our_tokenizer.tokenizer has the same attributes as official_tokenizer

        # Get the underlying tokenizer from our wrapper
        our_underlying_tokenizer = our_tokenizer.tokenizer

        # Compare key attributes that should be identical
        # Vocabulary size
        self.assertEqual(
            our_underlying_tokenizer.get_vocab_size(),
            official_tokenizer.get_vocab_size(),
            "Vocabulary sizes should match",
        )

        # Compare vocabularies (this might be large, so we'll sample some tokens)
        our_vocab = our_underlying_tokenizer.get_vocab()
        official_vocab = official_tokenizer.get_vocab()

        # Test a few common tokens to ensure vocabularies match
        common_test_tokens = ["hello", "world", "the", "and", "is", "a"]
        for token in common_test_tokens:
            if token in our_vocab and token in official_vocab:
                self.assertEqual(
                    our_vocab[token],
                    official_vocab[token],
                    f"Token '{token}' should have the same ID in both tokenizers",
                )

        # Compare special tokens if they exist
        # Get added tokens from both tokenizers
        our_added_tokens = our_underlying_tokenizer.get_added_tokens_decoder()
        official_added_tokens = official_tokenizer.get_added_tokens_decoder()

        # Compare the number of added tokens
        self.assertEqual(
            len(our_added_tokens),
            len(official_added_tokens),
            "Number of added special tokens should match",
        )

        # Compare each added token
        for token_id, our_token in our_added_tokens.items():
            if token_id in official_added_tokens:
                official_token = official_added_tokens[token_id]
                self.assertEqual(
                    our_token.content,
                    official_token.content,
                    f"Special token content should match for ID {token_id}",
                )
                # Compare token properties if they exist
                if hasattr(our_token, "special") and hasattr(official_token, "special"):
                    self.assertEqual(
                        our_token.special,
                        official_token.special,
                        f"Special token 'special' property should match for token '{our_token.content}'",
                    )

        # Step 6: Compare with transformers tokenizer if available
        if transformers_tokenizer:
            # Test text encoding/decoding with transformers tokenizer
            text = "Hello world! This is a test."

            # Get tokens from our tokenizer (using the wrapper's encode method)
            our_tokens = our_tokenizer.encode(text)
            our_decoded_text = our_tokenizer.decode(our_tokens)

            # Verify our tokenizer produces expected output
            self.assertIsInstance(our_tokens, list)
            self.assertEqual(our_decoded_text, text)

            # Get tokens from transformers tokenizer
            transformers_tokens = transformers_tokenizer.encode(text)
            transformers_decoded = transformers_tokenizer.decode(transformers_tokens)

            # Compare our tokens with transformers tokens
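            # Note: exact equality assumes both tokenizers apply the same
            # special-token handling (e.g., BOS/EOS insertion) when encoding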
            self.assertEqual(
                our_tokens,
                transformers_tokens,
                f"Tokens should match between our tokenizer and transformers tokenizer for input: '{text}'",
            )


if __name__ == "__main__":
    unittest.main()