Commit 5375abb

Fix tokenizer README repository ID (#1381)
Fix #1333 (comment). Downloading from both `Meta-Llama-3.1-8B` and `Llama-3.1-8B` works, but the README instructions and the .toml file used different repository IDs. This change updates the README so that it matches.
1 parent d54d05a commit 5375abb
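
The claim that both repository IDs resolve can be checked against the Hub directly; below is a minimal sketch (not part of this commit), assuming `huggingface_hub` is installed. Gated repos may still require an HF token for actual file downloads.

```python
# Minimal sketch (not part of this commit): confirm that both repository IDs
# from the commit message resolve on the Hugging Face Hub.
from huggingface_hub import model_info

for repo_id in ("meta-llama/Meta-Llama-3.1-8B", "meta-llama/Llama-3.1-8B"):
    info = model_info(repo_id)
    print(f"{repo_id} -> {info.id}")
```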

File tree (4 files changed: +7 −7 lines)

- README.md
- scripts/download_tokenizer.py
- tests/unit_tests/test_tokenizer.py
- torchtitan/config_manager.py


README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -104,7 +104,7 @@ Once you have confirmed access, you can run the following command to download th
 # Get your HF token from https://huggingface.co/settings/tokens
 
 # Llama 3.1 tokenizer
-python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B --hf_token=...
+python scripts/download_tokenizer.py --repo_id meta-llama/Llama-3.1-8B --hf_token=...
 ```
 
 ### Start a training run
````

scripts/download_tokenizer.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -32,7 +32,7 @@ def download_hf_tokenizer_files(
     - special_tokens_map.json - Special token mappings
 
     Args:
-        repo_id (str): HuggingFace repository ID (e.g., "meta-llama/Meta-Llama-3.1-8B")
+        repo_id (str): HuggingFace repository ID (e.g., meta-llama/Llama-3.1-8B")
         local_dir (str): Local directory to save tokenizer files. A subdirectory
             named after the model will be created automatically.
         hf_token (Optional[str]): HuggingFace API token for accessing private repositories.
@@ -141,7 +141,7 @@ def is_tokenizer_file(filename: str) -> bool:
         "--repo_id",
         type=str,
         required=True,
-        help="Repository ID to download from (e.g., 'meta-llama/Meta-Llama-3.1-8B', 'deepseek-ai/DeepSeek-V3')",
+        help="Repository ID to download from (e.g., 'meta-llama/Llama-3.1-8B', 'deepseek-ai/DeepSeek-V3')",
     )
     parser.add_argument(
         "--hf_token",
```

tests/unit_tests/test_tokenizer.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -244,7 +244,7 @@ def get_added_tokens_func(tokenizer):
     @parametrize(
         "test_repo_id",
         [
-            "meta-llama/Meta-Llama-3.1-8B",
+            "meta-llama/Llama-3.1-8B",
             "deepseek-ai/DeepSeek-V3",
             # "black-forest-labs/FLUX.1-dev", TODO: load the actual tokenizer
             "Qwen/Qwen2-7B",
@@ -267,9 +267,9 @@ def test_download_and_build_tokenizer(self, test_repo_id):
                 local_dir=self.temp_dir,
             )
         except HTTPError as e:
-            if test_repo_id == "meta-llama/Meta-Llama-3.1-8B":
+            if test_repo_id == "meta-llama/Llama-3.1-8B":
                 self.skipTest(
-                    f"Could not download tokenizer files for Meta-Llama-3.1-8B: {e}"
+                    f"Could not download tokenizer files for Llama-3.1-8B: {e}"
                 )
             else:
                 raise e
```
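
The hunk above relies on a skip-on-HTTPError pattern for the gated Llama repo. Below is a self-contained sketch of that pattern, with `snapshot_download` and `requests.exceptions.HTTPError` standing in for the repository's own helper and import (assumptions, not the actual test code).

```python
# Self-contained sketch of the skip-on-HTTPError pattern. The download call
# and the HTTPError import are stand-ins, not the repository's exact code.
import tempfile
import unittest

from huggingface_hub import snapshot_download
from requests.exceptions import HTTPError


class GatedTokenizerDownloadTest(unittest.TestCase):
    def test_download_or_skip(self) -> None:
        repo_id = "meta-llama/Llama-3.1-8B"
        try:
            snapshot_download(
                repo_id=repo_id,
                local_dir=tempfile.mkdtemp(),
                allow_patterns=["tokenizer*", "special_tokens_map.json"],
            )
        except HTTPError as e:
            # A gated or unreachable repo should skip the test rather than fail CI.
            self.skipTest(f"Could not download tokenizer files for {repo_id}: {e}")


if __name__ == "__main__":
    unittest.main()
```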

torchtitan/config_manager.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -891,7 +891,7 @@ def _validate_config(self) -> None:
         if self.config.model.tokenizer_path.endswith("tokenizer.model"):
             raise Exception(
                 "You are using the old tokenizer.model, please redownload the tokenizer ",
-                "(python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B) ",
+                "(python scripts/download_tokenizer.py --repo_id meta-llama/Llama-3.1-8B) ",
                 " and update your config to the directory of the downloaded tokenizer.",
             )
 
```

0 commit comments
