#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Example script demonstrating how to load and use tokenizers downloaded
by the download_tokenizer.py script.

This script shows how to:
1. Load tokenizers from downloaded tokenizer.json and tokenizer_config.json files
2. Use the HuggingFace tokenizers library to handle different tokenizer types
3. Encode and decode text using the loaded tokenizer
4. Display tokenizer information and vocabulary statistics
"""

import argparse
import os
import sys
from pathlib import Path

# Add the project root to the Python path so the torchtitan import below works
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from torchtitan.components.tokenizer import load_tokenizer
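
# A minimal sketch of the workflow this script wraps, assuming the tokenizer
# files were already fetched with download_tokenizer.py and that load_tokenizer
# returns a (tokenizer, config) pair, as it is used in the functions below:
#
#   tokenizer, config = load_tokenizer("assets/tokenizer/DeepSeek-V3")
#   encoding = tokenizer.encode("Hello, world!")
#   print(encoding.ids)                    # token IDs
#   print(tokenizer.decode(encoding.ids))  # round-trip back to text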


def tokenize_text(tokenizer, text: str) -> dict:
    """
    Tokenize text and return results without printing.

    Args:
        tokenizer: Loaded tokenizer
        text: Text to tokenize

    Returns:
        dict: Dictionary containing tokenization results
    """
    encoding = tokenizer.encode(text)
    return {
        'text': text,
        'token_ids': encoding.ids,
        'tokens': encoding.tokens,
        'num_tokens': len(encoding.ids),
        'decoded': tokenizer.decode(encoding.ids)
    }
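
# Illustrative shape of the result returned by tokenize_text (the concrete IDs
# and token strings depend entirely on which tokenizer is loaded):
#
#   tokenize_text(tokenizer, "Hello, world!")
#   # -> {'text':       'Hello, world!',
#   #     'token_ids':  [<int>, <int>, ...],
#   #     'tokens':     ['<str>', '<str>', ...],
#   #     'num_tokens': <len(token_ids)>,
#   #     'decoded':    '<round-tripped text, usually equal to the input>'}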


def compare_tokenizers(tokenizer1_path: str, tokenizer2_path: str):
    """
    Compare two tokenizers on the same set of test texts.

    Args:
        tokenizer1_path: Path to the first tokenizer directory
        tokenizer2_path: Path to the second tokenizer directory
    """
    test_texts = [
        "Hello, world!",
        "This is a test of tokenization.",
        "How are you doing today? 🤖",
        "The quick brown fox jumps over the lazy dog."
    ]

    try:
        # Load both tokenizers
        tokenizer1, config1 = load_tokenizer(tokenizer1_path)
        tokenizer2, config2 = load_tokenizer(tokenizer2_path)

        print("Comparing tokenizers:")
        print(f"  Tokenizer 1: {tokenizer1_path} (vocab size: {tokenizer1.get_vocab_size():,})")
        print(f"  Tokenizer 2: {tokenizer2_path} (vocab size: {tokenizer2.get_vocab_size():,})")
        print()

        for text in test_texts:
            result1 = tokenize_text(tokenizer1, text)
            result2 = tokenize_text(tokenizer2, text)

            print(f"Text: '{text}'")
            print(f"  Tokenizer 1: {result1['num_tokens']} tokens → {result1['tokens']}")
            print(f"  Tokenizer 2: {result2['num_tokens']} tokens → {result2['tokens']}")
            print(f"  Token count difference: {result2['num_tokens'] - result1['num_tokens']}")
            print()

    except Exception as e:
        print(f"❌ Error comparing tokenizers: {e}")

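# Example call, reusing the paths from the --help epilog below (adjust them to
# whatever tokenizers you have actually downloaded):
#
#   compare_tokenizers("assets/tokenizer/DeepSeek-V3",
#                      "assets/tokenizer/Meta-Llama-3.1-8B")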

def load_and_test_tokenizer(tokenizer_path: str, test_text: str = None) -> None:
    """
    Load a tokenizer from the specified path and test it with sample text.

    Args:
        tokenizer_path: Path to the directory containing tokenizer files
        test_text: Optional custom text to tokenize (uses a default if not provided)
    """
    if test_text is None:
        test_text = "Hello, world! This is a test of the tokenizer. How are you doing today? 🤖"

    try:
        tokenizer, tokenizer_config = load_tokenizer(tokenizer_path)

        print(f"✅ Successfully loaded tokenizer from: {tokenizer_path}")
        print(f"Tokenizer type: {type(tokenizer).__name__}")
        print(f"Vocab size: {tokenizer.get_vocab_size():,}")

        # Display the special-token entries of the tokenizer config, if available
        if tokenizer_config:
            print("Tokenizer configuration:")
            for key, value in tokenizer_config.items():
                if key in ['bos_token', 'eos_token', 'pad_token', 'unk_token']:
                    print(f"  {key}: {value}")

        # Test tokenization
        result = tokenize_text(tokenizer, test_text)

        print("\nTokenization results:")
        print(f"Input text: {result['text']}")
        print(f"Token IDs: {result['token_ids']}")
        print(f"Number of tokens: {result['num_tokens']}")
        print(f"Decoded text: {result['decoded']}")
        print(f"Individual tokens: {result['tokens']}")

    except Exception as e:
        print(f"❌ Error: {e}")
        print("Make sure you've downloaded the tokenizer files first using download_tokenizer.py")


def list_available_tokenizers(base_dir: str = "assets/tokenizer/") -> None:
    """List all available downloaded tokenizers."""
    if not os.path.exists(base_dir):
        print(f"Tokenizer directory '{base_dir}' does not exist.")
        return

    print(f"Available tokenizers in '{base_dir}':")
    tokenizer_dirs = []

    for item in os.listdir(base_dir):
        item_path = os.path.join(base_dir, item)
        if os.path.isdir(item_path):
            # Check if it contains tokenizer files
            has_tokenizer_files = any(
                os.path.exists(os.path.join(item_path, f))
                for f in ["tokenizer.json", "tokenizer_config.json", "tokenizer.model"]
            )
            if has_tokenizer_files:
                tokenizer_dirs.append(item)

    if tokenizer_dirs:
        for i, dir_name in enumerate(tokenizer_dirs, 1):
            print(f"  {i}. {dir_name}")
    else:
        print("  No tokenizers found. Use download_tokenizer.py to download some first.")

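# list_available_tokenizers expects a layout along these lines (a sketch; the
# directory names depend on which tokenizers were downloaded):
#
#   assets/tokenizer/
#   ├── DeepSeek-V3/
#   │   ├── tokenizer.json
#   │   └── tokenizer_config.json
#   └── Meta-Llama-3.1-8B/
#       └── tokenizer.json (and/or tokenizer_config.json, tokenizer.model)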

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Load and test tokenizers downloaded by download_tokenizer.py",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # List available tokenizers
  python scripts/use_tokenizer_example.py --list

  # Test DeepSeek tokenizer
  python scripts/use_tokenizer_example.py --tokenizer_path assets/tokenizer/DeepSeek-V3

  # Test Llama tokenizer
  python scripts/use_tokenizer_example.py --tokenizer_path assets/tokenizer/Meta-Llama-3.1-8B

  # Test with custom text
  python scripts/use_tokenizer_example.py --tokenizer_path assets/tokenizer/DeepSeek-V3 --text "Your custom text here"

  # Compare two tokenizers
  python scripts/use_tokenizer_example.py --compare assets/tokenizer/DeepSeek-V3 assets/tokenizer/Meta-Llama-3.1-8B
        """
    )

    parser.add_argument(
        "--tokenizer_path",
        type=str,
        help="Path to the tokenizer directory (e.g., 'assets/tokenizer/DeepSeek-V3')"
    )

    parser.add_argument(
        "--text",
        type=str,
        help="Custom text to tokenize (optional)"
    )

    parser.add_argument(
        "--list",
        action="store_true",
        help="List all available downloaded tokenizers"
    )

    parser.add_argument(
        "--compare",
        nargs=2,
        metavar=("TOKENIZER1", "TOKENIZER2"),
        help="Compare two tokenizers (provide paths to both tokenizer directories)"
    )

    parser.add_argument(
        "--base_dir",
        type=str,
        default="assets/tokenizer/",
        help="Base directory where tokenizers are stored (default: assets/tokenizer/)"
    )

    args = parser.parse_args()

    if args.list:
        list_available_tokenizers(args.base_dir)
    elif args.compare:
        compare_tokenizers(args.compare[0], args.compare[1])
    elif args.tokenizer_path:
        load_and_test_tokenizer(args.tokenizer_path, args.text)
    else:
        print("Please specify --tokenizer_path, --compare, or use --list to see available tokenizers.")
        print("Use --help for more information.")