diff --git a/README.md b/README.md index a194bcb7d..8c81fa500 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ Once you have confirmed access, you can run the following command to download th # Get your HF token from https://huggingface.co/settings/tokens # Llama 3.1 tokenizer -python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B --hf_token=... +python scripts/download_tokenizer.py --repo_id meta-llama/Llama-3.1-8B --hf_token=... ``` ### Start a training run diff --git a/scripts/download_tokenizer.py b/scripts/download_tokenizer.py index 664bd369b..3996ac29a 100644 --- a/scripts/download_tokenizer.py +++ b/scripts/download_tokenizer.py @@ -32,7 +32,7 @@ def download_hf_tokenizer_files( - special_tokens_map.json - Special token mappings Args: - repo_id (str): HuggingFace repository ID (e.g., "meta-llama/Meta-Llama-3.1-8B") + repo_id (str): HuggingFace repository ID (e.g., "meta-llama/Llama-3.1-8B") local_dir (str): Local directory to save tokenizer files. A subdirectory named after the model will be created automatically. hf_token (Optional[str]): HuggingFace API token for accessing private repositories. 
@@ -141,7 +141,7 @@ def is_tokenizer_file(filename: str) -> bool: "--repo_id", type=str, required=True, - help="Repository ID to download from (e.g., 'meta-llama/Meta-Llama-3.1-8B', 'deepseek-ai/DeepSeek-V3')", + help="Repository ID to download from (e.g., 'meta-llama/Llama-3.1-8B', 'deepseek-ai/DeepSeek-V3')", ) parser.add_argument( "--hf_token", diff --git a/tests/unit_tests/test_tokenizer.py b/tests/unit_tests/test_tokenizer.py index 72fa28a46..a7a3a7e62 100644 --- a/tests/unit_tests/test_tokenizer.py +++ b/tests/unit_tests/test_tokenizer.py @@ -244,7 +244,7 @@ def get_added_tokens_func(tokenizer): @parametrize( "test_repo_id", [ - "meta-llama/Meta-Llama-3.1-8B", + "meta-llama/Llama-3.1-8B", "deepseek-ai/DeepSeek-V3", # "black-forest-labs/FLUX.1-dev", TODO: load the actual tokenizer "Qwen/Qwen2-7B", @@ -267,9 +267,9 @@ def test_download_and_build_tokenizer(self, test_repo_id): local_dir=self.temp_dir, ) except HTTPError as e: - if test_repo_id == "meta-llama/Meta-Llama-3.1-8B": + if test_repo_id == "meta-llama/Llama-3.1-8B": self.skipTest( - f"Could not download tokenizer files for Meta-Llama-3.1-8B: {e}" + f"Could not download tokenizer files for Llama-3.1-8B: {e}" ) else: raise e diff --git a/torchtitan/config_manager.py b/torchtitan/config_manager.py index 8a2ebe434..5f1a1e8b7 100644 --- a/torchtitan/config_manager.py +++ b/torchtitan/config_manager.py @@ -891,7 +891,7 @@ def _validate_config(self) -> None: if self.config.model.tokenizer_path.endswith("tokenizer.model"): raise Exception( "You are using the old tokenizer.model, please redownload the tokenizer ", - "(python scripts/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3.1-8B) ", + "(python scripts/download_tokenizer.py --repo_id meta-llama/Llama-3.1-8B) ", " and update your config to the directory of the downloaded tokenizer.", )