pytorch · H-Huang · Jul 10, 2025 · Jun 24, 2025
@@ -3,3 +3,4 @@ pytest==7.3.2
 pytest-cov
 pre-commit
 tomli-w >= 1.1.0
+transformers
@@ -2,8 +2,6 @@ torchdata >= 0.8.0
 datasets >= 3.6.0
 tomli >= 1.1.0 ; python_version < "3.11"
 tensorboard
-tiktoken
-blobfile
 tabulate
 wandb
 fsspec

@@ -14,7 +14,7 @@ We actively welcome your pull requests.
 2. If you've added code that should be tested, add tests.
 3. If you've changed APIs, update the documentation.
 4. Ensure the test suite passes.
-5. Make sure your code lints (`pre-commit run --all-files`).
+5. Make sure your code lints (`pre-commit run --files $(git diff --name-only HEAD~1)`).
 6. If you haven't already, complete the Contributor License Agreement ("CLA").
 
 ### Contributor License Agreement ("CLA")

@@ -17,8 +17,7 @@ dependencies = [
     "datasets>=2.21.0",
 
     # Tokenization
-    "blobfile",
-    "tiktoken",
+    "tokenizers",
 
     # Miscellaneous
     "tomli>=1.1.0",

@@ -108,7 +108,7 @@ def is_tokenizer_file(filename: str) -> bool:
                 print(f"Successfully downloaded {filename} to {file_path}")
                 downloaded_files.append(filename)
             except HTTPError as e:
-                if e.response.status_code == 404:
+                if e.response is not None and e.response.status_code == 404:
                     print(f"File {filename} not found, skipping...")
                     continue
                 else:
@@ -122,7 +122,7 @@ def is_tokenizer_file(filename: str) -> bool:
             print(f"Warning: No tokenizer files could be downloaded from {repo_id}")
 
     except HTTPError as e:
-        if e.response.status_code == 401:
+        if e.response is not None and e.response.status_code == 401:
             print(
                 "You need to pass a valid `--hf_token=...` to download private checkpoints."
             )

@@ -165,7 +165,7 @@ def test_generate(
     input_ids = (
         (
             torch.tensor(
-                tokenizer.encode(prompt, bos=True, eos=False), dtype=torch.long
+                tokenizer.encode(prompt, add_bos=True, add_eos=False), dtype=torch.long
             )
             .view(1, -1)
             .repeat(batch_size, 1)