Commit f6ab45f

[WIP] support different tokenizers

1 parent 722f6e2 commit f6ab45f

File tree: 8 files changed, +383 -38 lines changed

.ci/docker/requirements.txt
Lines changed: 1 addition & 0 deletions

```diff
@@ -8,3 +8,4 @@ tabulate
 wandb
 fsspec
 tyro
+tokenizers >= 0.15.0
```
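The new dependency is the HuggingFace `tokenizers` package, which loads the downloaded `tokenizer.json` files directly. A minimal sketch of that flow, assuming the DeepSeek files have already been fetched to the default location used by the download script (the path and sample text are illustrative):

```python
# Minimal sketch: load a downloaded tokenizer.json with the `tokenizers` package.
# The path assumes the default --local_dir plus the per-repo subdirectory that
# the updated download script creates; adjust it to wherever your files landed.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("assets/tokenizer/DeepSeek-V3/tokenizer.json")

encoding = tokenizer.encode("Hello, world!")
print(encoding.ids)      # token ids
print(encoding.tokens)   # string pieces
print(tokenizer.decode(encoding.ids))
```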

.gitignore
Lines changed: 3 additions & 0 deletions

```diff
@@ -13,7 +13,10 @@ out
 wandb
 
 torchtitan/datasets/**/*.model
+
+# tokenizer models
 assets/**/*.model
+assets/**/*.json
 torchtitan/experiments/flux/assets/*
 
 # temp files
```

README.md
Lines changed: 5 additions & 2 deletions

````diff
@@ -103,8 +103,11 @@ Once you have confirmed access, you can run the following command to download th
 ```bash
 # Get your HF token from https://huggingface.co/settings/tokens
 
-# Llama 3.1 tokenizer.model
-python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B --tokenizer_path "original" --hf_token=...
+# Llama 3.1 tokenizer (automatically downloads original/tokenizer.model)
+python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B --hf_token=...
+
+# DeepSeek tokenizer (automatically downloads tokenizer.json and tokenizer_config.json)
+python scripts/download_tokenizer.py --repo_id deepseek-ai/DeepSeek-V3
 ```
 
 ### Start a training run
````
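Given how the rewritten script stores files (a per-repo subdirectory under `--local_dir`, with repo subpaths such as `original/` preserved), the two commands above should leave a layout roughly like the one sketched below. This is a hypothetical check; the exact file set depends on each Hub repo and is only inferred from the README comments in this diff.

```python
# Hypothetical check of what the two download commands above leave behind.
# The expectations in the comment only echo the README comments
# (original/tokenizer.model for Llama 3.1, tokenizer.json + tokenizer_config.json
# for DeepSeek-V3); the repos may contain additional tokenizer files.
import os

for root, _, files in os.walk("assets/tokenizer"):
    for name in files:
        print(os.path.join(root, name))

# Expected to include, among possibly other files:
#   assets/tokenizer/Meta-Llama-3.1-8B/original/tokenizer.model
#   assets/tokenizer/DeepSeek-V3/tokenizer.json
#   assets/tokenizer/DeepSeek-V3/tokenizer_config.json
```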

scripts/download_tokenizer.py
Lines changed: 76 additions & 28 deletions

```diff
@@ -4,28 +4,75 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Optional
+from typing import Optional, List
 
 from requests.exceptions import HTTPError
 
 
-def hf_download(
-    repo_id: str, tokenizer_path: str, local_dir: str, hf_token: Optional[str] = None
+def hf_download_tokenizer(
+    repo_id: str, local_dir: str, hf_token: Optional[str] = None
 ) -> None:
-    from huggingface_hub import hf_hub_download
-
-    tokenizer_path = (
-        f"{tokenizer_path}/tokenizer.model" if tokenizer_path else "tokenizer.model"
-    )
-
+    """
+    Download tokenizer files from HuggingFace Hub.
+
+    This function attempts to download common tokenizer files that work with
+    AutoTokenizer, including:
+    - tokenizer.json (vocab file for modern tokenizers)
+    - tokenizer_config.json (tokenizer configuration)
+    - tokenizer.model (SentencePiece model for Llama-style tokenizers)
+
+    Args:
+        repo_id: HuggingFace repository ID (e.g., "meta-llama/Meta-Llama-3.1-8B")
+        local_dir: Local directory to save tokenizer files
+        hf_token: Optional HuggingFace API token for private repos
+    """
+    from huggingface_hub import hf_hub_download, list_repo_files
+    import os
+
+    # Extract model name from repo_id (part after "/")
+    model_name = repo_id.split("/")[-1]
+    model_dir = os.path.join(local_dir, model_name)
+
+    # Common tokenizer files to download
+    tokenizer_files = [
+        "tokenizer.json",
+        "tokenizer_config.json",
+        "tokenizer.model",
+        "original/tokenizer.model",  # For Llama models
+    ]
+
     try:
-        hf_hub_download(
-            repo_id=repo_id,
-            filename=tokenizer_path,
-            local_dir=local_dir,
-            local_dir_use_symlinks=False,
-            token=hf_token,
-        )
+        # Get list of available files in the repo
+        available_files = list_repo_files(repo_id=repo_id, token=hf_token)
+
+        downloaded_files = []
+        for filename in tokenizer_files:
+            if filename in available_files:
+                try:
+                    print(f"Downloading {filename}...")
+                    hf_hub_download(
+                        repo_id=repo_id,
+                        filename=filename,
+                        local_dir=model_dir,
+                        local_dir_use_symlinks=False,
+                        token=hf_token,
+                    )
+                    file_path = os.path.join(model_dir, filename)
+                    print(f"Successfully downloaded {filename} to {file_path}")
+                    downloaded_files.append(filename)
+                except HTTPError as e:
+                    if e.response.status_code == 404:
+                        # File doesn't exist, skip it
+                        continue
+                    else:
+                        raise e
+
+        if not downloaded_files:
+            print(f"Warning: No common tokenizer files found in {repo_id}")
+            print(f"Available files: {available_files[:10]}...")  # Show first 10 files
+        else:
+            print(f"All files downloaded to: {model_dir}")
+
     except HTTPError as e:
         if e.response.status_code == 401:
             print(
@@ -38,28 +85,29 @@ def hf_download
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser(description="Download tokenizer from HuggingFace.")
-    parser.add_argument(
-        "--repo_id",
-        type=str,
-        default="meta-llama/Meta-Llama-3.1-8B",
-        help="Repository ID to download from. default to Llama-3.1-8B",
+    parser = argparse.ArgumentParser(
+        description="Download tokenizer files from HuggingFace Hub. "
+        "Automatically detects and downloads common tokenizer files (tokenizer.json, "
+        "tokenizer_config.json, tokenizer.model) that work with AutoTokenizer."
     )
     parser.add_argument(
-        "--tokenizer_path",
+        "--repo_id",
         type=str,
-        default="original",
-        help="the tokenizer.model path relative to repo_id",
+        required=True,
+        help="Repository ID to download from (e.g., 'meta-llama/Meta-Llama-3.1-8B', 'deepseek-ai/DeepSeek-V3')",
     )
     parser.add_argument(
-        "--hf_token", type=str, default=None, help="HuggingFace API token"
+        "--hf_token",
+        type=str,
+        default=None,
+        help="HuggingFace API token (required for private repos)"
     )
     parser.add_argument(
         "--local_dir",
         type=str,
         default="assets/tokenizer/",
-        help="local directory to save the tokenizer.model",
+        help="Local directory to save tokenizer files (default: assets/tokenizer/)",
     )
 
     args = parser.parse_args()
-    hf_download(args.repo_id, args.tokenizer_path, args.local_dir, args.hf_token)
+    hf_download_tokenizer(args.repo_id, args.local_dir, args.hf_token)
```
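Since the CLI is now a thin wrapper, the helper can also be called directly from Python. A minimal usage sketch, assuming it is run from the repository root so that `scripts/` resolves as a namespace package; the import path is an assumption, not something this diff demonstrates:

```python
# Hypothetical direct use of the new helper (the import path assumes running
# from the repository root; the diff itself only shows the CLI entry point).
from scripts.download_tokenizer import hf_download_tokenizer

hf_download_tokenizer(
    repo_id="deepseek-ai/DeepSeek-V3",
    local_dir="assets/tokenizer/",
    hf_token=None,  # only needed for gated or private repos
)
# Files are written under assets/tokenizer/DeepSeek-V3/
```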

scripts/use_tokenizer_example.py
Lines changed: 225 additions & 0 deletions

New file (@@ -0,0 +1,225 @@), shown without the leading + markers for readability:

```python
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Example script demonstrating how to load and use tokenizers downloaded
by download_tokenizer.py script.

This script shows how to:
1. Load tokenizers from downloaded tokenizer.json and tokenizer_config.json files
2. Use HuggingFace tokenizers library to handle different tokenizer types
3. Encode and decode text using the loaded tokenizer
4. Display tokenizer information and vocabulary statistics
"""

import argparse
import json
import os
import sys
from pathlib import Path

# Add the project root to Python path to ensure imports work
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

from torchtitan.components.tokenizer import load_tokenizer


def tokenize_text(tokenizer, text: str) -> dict:
    """
    Tokenize text and return results without printing.

    Args:
        tokenizer: Loaded tokenizer
        text: Text to tokenize

    Returns:
        dict: Dictionary containing tokenization results
    """
    encoding = tokenizer.encode(text)
    return {
        'text': text,
        'token_ids': encoding.ids,
        'tokens': encoding.tokens,
        'num_tokens': len(encoding.ids),
        'decoded': tokenizer.decode(encoding.ids)
    }


def compare_tokenizers(tokenizer1_path: str, tokenizer2_path: str):
    """
    Compare two tokenizers on the same text.

    Args:
        tokenizer1_path: Path to first tokenizer
        tokenizer2_path: Path to second tokenizer
        test_texts: List of texts to compare (uses defaults if None)
    """
    test_texts = [
        "Hello, world!",
        "This is a test of tokenization.",
        "How are you doing today? 🤖",
        "The quick brown fox jumps over the lazy dog."
    ]

    try:
        # Load tokenizers
        tokenizer1, config1 = load_tokenizer(tokenizer1_path)
        tokenizer2, config2 = load_tokenizer(tokenizer2_path)

        print(f"Comparing tokenizers:")
        print(f" Tokenizer 1: {tokenizer1_path} (vocab size: {tokenizer1.get_vocab_size():,})")
        print(f" Tokenizer 2: {tokenizer2_path} (vocab size: {tokenizer2.get_vocab_size():,})")
        print()

        for text in test_texts:
            result1 = tokenize_text(tokenizer1, text)
            result2 = tokenize_text(tokenizer2, text)

            print(f"Text: '{text}'")
            print(f" Tokenizer 1: {result1['num_tokens']} tokens → {result1['tokens']}")
            print(f" Tokenizer 2: {result2['num_tokens']} tokens → {result2['tokens']}")
            print(f" Token count difference: {result2['num_tokens'] - result1['num_tokens']}")
            print()

    except Exception as e:
        print(f"❌ Error comparing tokenizers: {e}")


def load_and_test_tokenizer(tokenizer_path: str, test_text: str = None) -> None:
    """
    Load a tokenizer from the specified path and test it with sample text.

    Args:
        tokenizer_path: Path to the directory containing tokenizer files
        test_text: Optional custom text to tokenize (uses default if not provided)
    """
    if test_text is None:
        test_text = "Hello, world! This is a test of the tokenizer. How are you doing today? 🤖"

    try:
        tokenizer, tokenizer_config = load_tokenizer(tokenizer_path)

        print(f"✅ Successfully loaded tokenizer from: {tokenizer_path}")
        print(f"Tokenizer type: {type(tokenizer).__name__}")
        print(f"Vocab size: {tokenizer.get_vocab_size():,}")

        # Display tokenizer config if available
        if tokenizer_config:
            print("Tokenizer configuration:")
            for key, value in tokenizer_config.items():
                if key in ['bos_token', 'eos_token', 'pad_token', 'unk_token']:
                    print(f" {key}: {value}")

        # Test tokenization
        result = tokenize_text(tokenizer, test_text)

        print(f"\nTokenization results:")
        print(f"Input text: {result['text']}")
        print(f"Token IDs: {result['token_ids']}")
        print(f"Number of tokens: {result['num_tokens']}")
        print(f"Decoded text: {result['decoded']}")
        print(f"Individual tokens: {result['tokens']}")

    except Exception as e:
        print(f"❌ Error: {e}")
        print("Make sure you've downloaded the tokenizer files first using download_tokenizer.py")


def list_available_tokenizers(base_dir: str = "assets/tokenizer/") -> None:
    """List all available downloaded tokenizers."""
    if not os.path.exists(base_dir):
        print(f"Tokenizer directory '{base_dir}' does not exist.")
        return

    print(f"Available tokenizers in '{base_dir}':")
    tokenizer_dirs = []

    for item in os.listdir(base_dir):
        item_path = os.path.join(base_dir, item)
        if os.path.isdir(item_path):
            # Check if it contains tokenizer files
            has_tokenizer_files = any(
                os.path.exists(os.path.join(item_path, f))
                for f in ["tokenizer.json", "tokenizer_config.json", "tokenizer.model"]
            )
            if has_tokenizer_files:
                tokenizer_dirs.append(item)

    if tokenizer_dirs:
        for i, dir_name in enumerate(tokenizer_dirs, 1):
            print(f" {i}. {dir_name}")
    else:
        print(" No tokenizers found. Use download_tokenizer.py to download some first.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Load and test tokenizers downloaded by download_tokenizer.py",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# List available tokenizers
python scripts/use_tokenizer_example.py --list

# Test DeepSeek tokenizer
python scripts/use_tokenizer_example.py --tokenizer_path assets/tokenizer/DeepSeek-V3

# Test Llama tokenizer
python scripts/use_tokenizer_example.py --tokenizer_path assets/tokenizer/Meta-Llama-3.1-8B

# Test with custom text
python scripts/use_tokenizer_example.py --tokenizer_path assets/tokenizer/DeepSeek-V3 --text "Your custom text here"

# Compare two tokenizers
python scripts/use_tokenizer_example.py --compare assets/tokenizer/DeepSeek-V3 assets/tokenizer/Meta-Llama-3.1-8B
"""
    )

    parser.add_argument(
        "--tokenizer_path",
        type=str,
        help="Path to the tokenizer directory (e.g., 'assets/tokenizer/DeepSeek-V3')"
    )

    parser.add_argument(
        "--text",
        type=str,
        help="Custom text to tokenize (optional)"
    )

    parser.add_argument(
        "--list",
        action="store_true",
        help="List all available downloaded tokenizers"
    )

    parser.add_argument(
        "--compare",
        nargs=2,
        metavar=("TOKENIZER1", "TOKENIZER2"),
        help="Compare two tokenizers (provide paths to both tokenizer directories)"
    )

    parser.add_argument(
        "--base_dir",
        type=str,
        default="assets/tokenizer/",
        help="Base directory where tokenizers are stored (default: assets/tokenizer/)"
    )

    args = parser.parse_args()

    if args.list:
        list_available_tokenizers(args.base_dir)
    elif args.compare:
        compare_tokenizers(args.compare[0], args.compare[1])
    elif args.tokenizer_path:
        load_and_test_tokenizer(args.tokenizer_path, args.text)
    else:
        print("Please specify --tokenizer_path, --compare, or use --list to see available tokenizers.")
        print("Use --help for more information.")
```
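The example imports `load_tokenizer` from `torchtitan.components.tokenizer`, which is among the 8 changed files but is not shown in this excerpt. Judging only from how the example uses it (it returns a `tokenizers.Tokenizer` plus the parsed `tokenizer_config.json`, or `None` when no config is present), a compatible loader could look roughly like the sketch below; this is an assumption, not the implementation in the commit.

```python
# Hypothetical loader compatible with how use_tokenizer_example.py calls it;
# the actual torchtitan.components.tokenizer.load_tokenizer is not shown here.
import json
import os
from typing import Optional, Tuple

from tokenizers import Tokenizer


def load_tokenizer(tokenizer_dir: str) -> Tuple[Tokenizer, Optional[dict]]:
    """Load tokenizer.json (plus tokenizer_config.json, if present) from a directory."""
    tokenizer = Tokenizer.from_file(os.path.join(tokenizer_dir, "tokenizer.json"))

    config = None
    config_path = os.path.join(tokenizer_dir, "tokenizer_config.json")
    if os.path.exists(config_path):
        with open(config_path) as f:
            config = json.load(f)
    return tokenizer, config
```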
