Commit 5375abb

Fix tokenizer README repository ID (#1381)
Fix #1333 (comment). Downloading from both `Meta-Llama-3.1-8B` and `Llama-3.1-8B` works, but the README instructions and the .toml file used different repository IDs. This change updates the README so that it matches.
1 parent d54d05a commit 5375abb
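
The claim that both repository IDs resolve can be checked against the Hub directly; below is a minimal sketch (not part of this commit), assuming `huggingface_hub` is installed. Gated repos may still require an HF token for actual file downloads.

```python
# Minimal sketch (not part of this commit): confirm that both repository IDs
# from the commit message resolve on the Hugging Face Hub.
from huggingface_hub import model_info

for repo_id in ("meta-llama/Meta-Llama-3.1-8B", "meta-llama/Llama-3.1-8B"):
    info = model_info(repo_id)
    print(f"{repo_id} -> {info.id}")
```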

File tree (4 files changed: +7 −7 lines)

- README.md
- scripts/download_tokenizer.py
- tests/unit_tests/test_tokenizer.py
- torchtitan/config_manager.py


README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -104,7 +104,7 @@ Once you have confirmed access, you can run the following command to download th
 # Get your HF token from https://huggingface.co/settings/tokens
 
 # Llama 3.1 tokenizer
-python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B --hf_token=...
+python scripts/download_tokenizer.py --repo_id meta-llama/Llama-3.1-8B --hf_token=...
 ```
 
 ### Start a training run
````

scripts/download_tokenizer.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -32,7 +32,7 @@ def download_hf_tokenizer_files(
     - special_tokens_map.json - Special token mappings
 
     Args:
-        repo_id (str): HuggingFace repository ID (e.g., "meta-llama/Meta-Llama-3.1-8B")
+        repo_id (str): HuggingFace repository ID (e.g., meta-llama/Llama-3.1-8B")
         local_dir (str): Local directory to save tokenizer files. A subdirectory
             named after the model will be created automatically.
         hf_token (Optional[str]): HuggingFace API token for accessing private repositories.
@@ -141,7 +141,7 @@ def is_tokenizer_file(filename: str) -> bool:
         "--repo_id",
         type=str,
         required=True,
-        help="Repository ID to download from (e.g., 'meta-llama/Meta-Llama-3.1-8B', 'deepseek-ai/DeepSeek-V3')",
+        help="Repository ID to download from (e.g., 'meta-llama/Llama-3.1-8B', 'deepseek-ai/DeepSeek-V3')",
     )
     parser.add_argument(
         "--hf_token",
```

tests/unit_tests/test_tokenizer.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -244,7 +244,7 @@ def get_added_tokens_func(tokenizer):
     @parametrize(
         "test_repo_id",
         [
-            "meta-llama/Meta-Llama-3.1-8B",
+            "meta-llama/Llama-3.1-8B",
             "deepseek-ai/DeepSeek-V3",
             # "black-forest-labs/FLUX.1-dev", TODO: load the actual tokenizer
             "Qwen/Qwen2-7B",
@@ -267,9 +267,9 @@ def test_download_and_build_tokenizer(self, test_repo_id):
                 local_dir=self.temp_dir,
             )
         except HTTPError as e:
-            if test_repo_id == "meta-llama/Meta-Llama-3.1-8B":
+            if test_repo_id == "meta-llama/Llama-3.1-8B":
                 self.skipTest(
-                    f"Could not download tokenizer files for Meta-Llama-3.1-8B: {e}"
+                    f"Could not download tokenizer files for Llama-3.1-8B: {e}"
                 )
             else:
                 raise e
```
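
The hunk above relies on a skip-on-HTTPError pattern for the gated Llama repo. Below is a self-contained sketch of that pattern, with `snapshot_download` and `requests.exceptions.HTTPError` standing in for the repository's own helper and import (assumptions, not the actual test code).

```python
# Self-contained sketch of the skip-on-HTTPError pattern. The download call
# and the HTTPError import are stand-ins, not the repository's exact code.
import tempfile
import unittest

from huggingface_hub import snapshot_download
from requests.exceptions import HTTPError


class GatedTokenizerDownloadTest(unittest.TestCase):
    def test_download_or_skip(self) -> None:
        repo_id = "meta-llama/Llama-3.1-8B"
        try:
            snapshot_download(
                repo_id=repo_id,
                local_dir=tempfile.mkdtemp(),
                allow_patterns=["tokenizer*", "special_tokens_map.json"],
            )
        except HTTPError as e:
            # A gated or unreachable repo should skip the test rather than fail CI.
            self.skipTest(f"Could not download tokenizer files for {repo_id}: {e}")


if __name__ == "__main__":
    unittest.main()
```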

torchtitan/config_manager.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -891,7 +891,7 @@ def _validate_config(self) -> None:
         if self.config.model.tokenizer_path.endswith("tokenizer.model"):
             raise Exception(
                 "You are using the old tokenizer.model, please redownload the tokenizer ",
-                "(python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B) ",
+                "(python scripts/download_tokenizer.py --repo_id meta-llama/Llama-3.1-8B) ",
                 " and update your config to the directory of the downloaded tokenizer.",
             )
 
```

0 commit comments
