Commit f6ab45f

[WIP] support different tokenizers

1 parent 722f6e2 commit f6ab45f

File tree: 8 files changed, +383 -38 lines changed

.ci/docker/requirements.txt
Lines changed: 1 addition & 0 deletions

```diff
@@ -8,3 +8,4 @@ tabulate
 wandb
 fsspec
 tyro
+tokenizers >= 0.15.0
```
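The new dependency is the HuggingFace `tokenizers` package, which loads the downloaded `tokenizer.json` files directly. A minimal sketch of that flow, assuming the DeepSeek files have already been fetched to the default location used by the download script (the path and sample text are illustrative):

```python
# Minimal sketch: load a downloaded tokenizer.json with the `tokenizers` package.
# The path assumes the default --local_dir plus the per-repo subdirectory that
# the updated download script creates; adjust it to wherever your files landed.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("assets/tokenizer/DeepSeek-V3/tokenizer.json")

encoding = tokenizer.encode("Hello, world!")
print(encoding.ids)      # token ids
print(encoding.tokens)   # string pieces
print(tokenizer.decode(encoding.ids))
```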

.gitignore
Lines changed: 3 additions & 0 deletions

```diff
@@ -13,7 +13,10 @@ out
 wandb
 
 torchtitan/datasets/**/*.model
+
+# tokenizer models
 assets/**/*.model
+assets/**/*.json
 torchtitan/experiments/flux/assets/*
 
 # temp files
```

README.md
Lines changed: 5 additions & 2 deletions

````diff
@@ -103,8 +103,11 @@ Once you have confirmed access, you can run the following command to download th
 ```bash
 # Get your HF token from https://huggingface.co/settings/tokens
 
-# Llama 3.1 tokenizer.model
-python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B --tokenizer_path "original" --hf_token=...
+# Llama 3.1 tokenizer (automatically downloads original/tokenizer.model)
+python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B --hf_token=...
+
+# DeepSeek tokenizer (automatically downloads tokenizer.json and tokenizer_config.json)
+python scripts/download_tokenizer.py --repo_id deepseek-ai/DeepSeek-V3
 ```
 
 ### Start a training run
````
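Given how the rewritten script stores files (a per-repo subdirectory under `--local_dir`, with repo subpaths such as `original/` preserved), the two commands above should leave a layout roughly like the one sketched below. This is a hypothetical check; the exact file set depends on each Hub repo and is only inferred from the README comments in this diff.

```python
# Hypothetical check of what the two download commands above leave behind.
# The expectations in the comment only echo the README comments
# (original/tokenizer.model for Llama 3.1, tokenizer.json + tokenizer_config.json
# for DeepSeek-V3); the repos may contain additional tokenizer files.
import os

for root, _, files in os.walk("assets/tokenizer"):
    for name in files:
        print(os.path.join(root, name))

# Expected to include, among possibly other files:
#   assets/tokenizer/Meta-Llama-3.1-8B/original/tokenizer.model
#   assets/tokenizer/DeepSeek-V3/tokenizer.json
#   assets/tokenizer/DeepSeek-V3/tokenizer_config.json
```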

scripts/download_tokenizer.py
Lines changed: 76 additions & 28 deletions

```diff
@@ -4,28 +4,75 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Optional
+from typing import Optional, List
 
 from requests.exceptions import HTTPError
 
 
-def hf_download(
-    repo_id: str, tokenizer_path: str, local_dir: str, hf_token: Optional[str] = None
+def hf_download_tokenizer(
+    repo_id: str, local_dir: str, hf_token: Optional[str] = None
 ) -> None:
-    from huggingface_hub import hf_hub_download
-
-    tokenizer_path = (
-        f"{tokenizer_path}/tokenizer.model" if tokenizer_path else "tokenizer.model"
-    )
-
+    """
+    Download tokenizer files from HuggingFace Hub.
+
+    This function attempts to download common tokenizer files that work with
+    AutoTokenizer, including:
+    - tokenizer.json (vocab file for modern tokenizers)
+    - tokenizer_config.json (tokenizer configuration)
+    - tokenizer.model (SentencePiece model for Llama-style tokenizers)
+
+    Args:
+        repo_id: HuggingFace repository ID (e.g., "meta-llama/Meta-Llama-3.1-8B")
+        local_dir: Local directory to save tokenizer files
+        hf_token: Optional HuggingFace API token for private repos
+    """
+    from huggingface_hub import hf_hub_download, list_repo_files
+    import os
+
+    # Extract model name from repo_id (part after "/")
+    model_name = repo_id.split("/")[-1]
+    model_dir = os.path.join(local_dir, model_name)
+
+    # Common tokenizer files to download
+    tokenizer_files = [
+        "tokenizer.json",
+        "tokenizer_config.json",
+        "tokenizer.model",
+        "original/tokenizer.model",  # For Llama models
+    ]
+
     try:
-        hf_hub_download(
-            repo_id=repo_id,
-            filename=tokenizer_path,
-            local_dir=local_dir,
-            local_dir_use_symlinks=False,
-            token=hf_token,
-        )
+        # Get list of available files in the repo
+        available_files = list_repo_files(repo_id=repo_id, token=hf_token)
+
+        downloaded_files = []
+        for filename in tokenizer_files:
+            if filename in available_files:
+                try:
+                    print(f"Downloading {filename}...")
+                    hf_hub_download(
+                        repo_id=repo_id,
+                        filename=filename,
+                        local_dir=model_dir,
+                        local_dir_use_symlinks=False,
+                        token=hf_token,
+                    )
+                    file_path = os.path.join(model_dir, filename)
+                    print(f"Successfully downloaded {filename} to {file_path}")
+                    downloaded_files.append(filename)
+                except HTTPError as e:
+                    if e.response.status_code == 404:
+                        # File doesn't exist, skip it
+                        continue
+                    else:
+                        raise e
+
+        if not downloaded_files:
+            print(f"Warning: No common tokenizer files found in {repo_id}")
+            print(f"Available files: {available_files[:10]}...")  # Show first 10 files
+        else:
+            print(f"All files downloaded to: {model_dir}")
+
     except HTTPError as e:
         if e.response.status_code == 401:
             print(
@@ -38,28 +85,29 @@ def hf_download
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser(description="Download tokenizer from HuggingFace.")
-    parser.add_argument(
-        "--repo_id",
-        type=str,
-        default="meta-llama/Meta-Llama-3.1-8B",
-        help="Repository ID to download from. default to Llama-3.1-8B",
+    parser = argparse.ArgumentParser(
+        description="Download tokenizer files from HuggingFace Hub. "
+        "Automatically detects and downloads common tokenizer files (tokenizer.json, "
+        "tokenizer_config.json, tokenizer.model) that work with AutoTokenizer."
     )
     parser.add_argument(
-        "--tokenizer_path",
+        "--repo_id",
         type=str,
-        default="original",
-        help="the tokenizer.model path relative to repo_id",
+        required=True,
+        help="Repository ID to download from (e.g., 'meta-llama/Meta-Llama-3.1-8B', 'deepseek-ai/DeepSeek-V3')",
     )
     parser.add_argument(
-        "--hf_token", type=str, default=None, help="HuggingFace API token"
+        "--hf_token",
+        type=str,
+        default=None,
+        help="HuggingFace API token (required for private repos)"
     )
     parser.add_argument(
         "--local_dir",
         type=str,
         default="assets/tokenizer/",
-        help="local directory to save the tokenizer.model",
+        help="Local directory to save tokenizer files (default: assets/tokenizer/)",
     )
 
     args = parser.parse_args()
-    hf_download(args.repo_id, args.tokenizer_path, args.local_dir, args.hf_token)
+    hf_download_tokenizer(args.repo_id, args.local_dir, args.hf_token)
```
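Since the CLI is now a thin wrapper, the helper can also be called directly from Python. A minimal usage sketch, assuming it is run from the repository root so that `scripts/` resolves as a namespace package; the import path is an assumption, not something this diff demonstrates:

```python
# Hypothetical direct use of the new helper (the import path assumes running
# from the repository root; the diff itself only shows the CLI entry point).
from scripts.download_tokenizer import hf_download_tokenizer

hf_download_tokenizer(
    repo_id="deepseek-ai/DeepSeek-V3",
    local_dir="assets/tokenizer/",
    hf_token=None,  # only needed for gated or private repos
)
# Files are written under assets/tokenizer/DeepSeek-V3/
```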

scripts/use_tokenizer_example.py
Lines changed: 225 additions & 0 deletions

New file (@@ -0,0 +1,225 @@), shown without the leading + markers for readability:

```python
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Example script demonstrating how to load and use tokenizers downloaded
by download_tokenizer.py script.

This script shows how to:
1. Load tokenizers from downloaded tokenizer.json and tokenizer_config.json files
2. Use HuggingFace tokenizers library to handle different tokenizer types
3. Encode and decode text using the loaded tokenizer
4. Display tokenizer information and vocabulary statistics
"""

import argparse
import json
import os
import sys
from pathlib import Path

# Add the project root to Python path to ensure imports work
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from torchtitan.components.tokenizer import load_tokenizer


def tokenize_text(tokenizer, text: str) -> dict:
    """
    Tokenize text and return results without printing.

    Args:
        tokenizer: Loaded tokenizer
        text: Text to tokenize

    Returns:
        dict: Dictionary containing tokenization results
    """
    encoding = tokenizer.encode(text)
    return {
        'text': text,
        'token_ids': encoding.ids,
        'tokens': encoding.tokens,
        'num_tokens': len(encoding.ids),
        'decoded': tokenizer.decode(encoding.ids)
    }


def compare_tokenizers(tokenizer1_path: str, tokenizer2_path: str):
    """
    Compare two tokenizers on the same text.

    Args:
        tokenizer1_path: Path to first tokenizer
        tokenizer2_path: Path to second tokenizer
        test_texts: List of texts to compare (uses defaults if None)
    """
    test_texts = [
        "Hello, world!",
        "This is a test of tokenization.",
        "How are you doing today? 🤖",
        "The quick brown fox jumps over the lazy dog."
    ]

    try:
        # Load tokenizers
        tokenizer1, config1 = load_tokenizer(tokenizer1_path)
        tokenizer2, config2 = load_tokenizer(tokenizer2_path)

        print(f"Comparing tokenizers:")
        print(f" Tokenizer 1: {tokenizer1_path} (vocab size: {tokenizer1.get_vocab_size():,})")
        print(f" Tokenizer 2: {tokenizer2_path} (vocab size: {tokenizer2.get_vocab_size():,})")
        print()

        for text in test_texts:
            result1 = tokenize_text(tokenizer1, text)
            result2 = tokenize_text(tokenizer2, text)

            print(f"Text: '{text}'")
            print(f" Tokenizer 1: {result1['num_tokens']} tokens → {result1['tokens']}")
            print(f" Tokenizer 2: {result2['num_tokens']} tokens → {result2['tokens']}")
            print(f" Token count difference: {result2['num_tokens'] - result1['num_tokens']}")
            print()

    except Exception as e:
        print(f"❌ Error comparing tokenizers: {e}")


def load_and_test_tokenizer(tokenizer_path: str, test_text: str = None) -> None:
    """
    Load a tokenizer from the specified path and test it with sample text.

    Args:
        tokenizer_path: Path to the directory containing tokenizer files
        test_text: Optional custom text to tokenize (uses default if not provided)
    """
    if test_text is None:
        test_text = "Hello, world! This is a test of the tokenizer. How are you doing today? 🤖"

    try:
        tokenizer, tokenizer_config = load_tokenizer(tokenizer_path)

        print(f"✅ Successfully loaded tokenizer from: {tokenizer_path}")
        print(f"Tokenizer type: {type(tokenizer).__name__}")
        print(f"Vocab size: {tokenizer.get_vocab_size():,}")

        # Display tokenizer config if available
        if tokenizer_config:
            print("Tokenizer configuration:")
            for key, value in tokenizer_config.items():
                if key in ['bos_token', 'eos_token', 'pad_token', 'unk_token']:
                    print(f" {key}: {value}")

        # Test tokenization
        result = tokenize_text(tokenizer, test_text)

        print(f"\nTokenization results:")
        print(f"Input text: {result['text']}")
        print(f"Token IDs: {result['token_ids']}")
        print(f"Number of tokens: {result['num_tokens']}")
        print(f"Decoded text: {result['decoded']}")
        print(f"Individual tokens: {result['tokens']}")

    except Exception as e:
        print(f"❌ Error: {e}")
        print("Make sure you've downloaded the tokenizer files first using download_tokenizer.py")


def list_available_tokenizers(base_dir: str = "assets/tokenizer/") -> None:
    """List all available downloaded tokenizers."""
    if not os.path.exists(base_dir):
        print(f"Tokenizer directory '{base_dir}' does not exist.")
        return

    print(f"Available tokenizers in '{base_dir}':")
    tokenizer_dirs = []

    for item in os.listdir(base_dir):
        item_path = os.path.join(base_dir, item)
        if os.path.isdir(item_path):
            # Check if it contains tokenizer files
            has_tokenizer_files = any(
                os.path.exists(os.path.join(item_path, f))
                for f in ["tokenizer.json", "tokenizer_config.json", "tokenizer.model"]
            )
            if has_tokenizer_files:
                tokenizer_dirs.append(item)

    if tokenizer_dirs:
        for i, dir_name in enumerate(tokenizer_dirs, 1):
            print(f" {i}. {dir_name}")
    else:
        print(" No tokenizers found. Use download_tokenizer.py to download some first.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Load and test tokenizers downloaded by download_tokenizer.py",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# List available tokenizers
python scripts/use_tokenizer_example.py --list

# Test DeepSeek tokenizer
python scripts/use_tokenizer_example.py --tokenizer_path assets/tokenizer/DeepSeek-V3

# Test Llama tokenizer
python scripts/use_tokenizer_example.py --tokenizer_path assets/tokenizer/Meta-Llama-3.1-8B

# Test with custom text
python scripts/use_tokenizer_example.py --tokenizer_path assets/tokenizer/DeepSeek-V3 --text "Your custom text here"

# Compare two tokenizers
python scripts/use_tokenizer_example.py --compare assets/tokenizer/DeepSeek-V3 assets/tokenizer/Meta-Llama-3.1-8B
"""
    )

    parser.add_argument(
        "--tokenizer_path",
        type=str,
        help="Path to the tokenizer directory (e.g., 'assets/tokenizer/DeepSeek-V3')"
    )

    parser.add_argument(
        "--text",
        type=str,
        help="Custom text to tokenize (optional)"
    )

    parser.add_argument(
        "--list",
        action="store_true",
        help="List all available downloaded tokenizers"
    )

    parser.add_argument(
        "--compare",
        nargs=2,
        metavar=("TOKENIZER1", "TOKENIZER2"),
        help="Compare two tokenizers (provide paths to both tokenizer directories)"
    )

    parser.add_argument(
        "--base_dir",
        type=str,
        default="assets/tokenizer/",
        help="Base directory where tokenizers are stored (default: assets/tokenizer/)"
    )

    args = parser.parse_args()

    if args.list:
        list_available_tokenizers(args.base_dir)
    elif args.compare:
        compare_tokenizers(args.compare[0], args.compare[1])
    elif args.tokenizer_path:
        load_and_test_tokenizer(args.tokenizer_path, args.text)
    else:
        print("Please specify --tokenizer_path, --compare, or use --list to see available tokenizers.")
        print("Use --help for more information.")
```
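The example imports `load_tokenizer` from `torchtitan.components.tokenizer`, which is among the 8 changed files but is not shown in this excerpt. Judging only from how the example uses it (it returns a `tokenizers.Tokenizer` plus the parsed `tokenizer_config.json`, or `None` when no config is present), a compatible loader could look roughly like the sketch below; this is an assumption, not the implementation in the commit.

```python
# Hypothetical loader compatible with how use_tokenizer_example.py calls it;
# the actual torchtitan.components.tokenizer.load_tokenizer is not shown here.
import json
import os
from typing import Optional, Tuple

from tokenizers import Tokenizer


def load_tokenizer(tokenizer_dir: str) -> Tuple[Tokenizer, Optional[dict]]:
    """Load tokenizer.json (plus tokenizer_config.json, if present) from a directory."""
    tokenizer = Tokenizer.from_file(os.path.join(tokenizer_dir, "tokenizer.json"))

    config = None
    config_path = os.path.join(tokenizer_dir, "tokenizer_config.json")
    if os.path.exists(config_path):
        with open(config_path) as f:
            config = json.load(f)
    return tokenizer, config
```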
