
Commit 681df50

Refactor Tokenizer -> BaseTokenizer (#1333)
This causes breaking changes: users will need to re-download the tokenizer files (`python scripts/download_tokenizer.py ...`).

- Remove the `tiktoken` dependency, remove `tiktoken.py`
- Refactor the `Tokenizer` base class
- Update config files to point to the tokenizer directory instead of `tokenizer.model`
- Raise an exception if `tokenizer.model` is used as `tokenizer_path`
1 parent 8518306 commit 681df50

40 files changed: +2257 −2454 lines
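
For users, the practical impact of the breaking change above is that `tokenizer_path` now names a directory of downloaded tokenizer assets rather than a single `tokenizer.model` file. Below is a minimal sketch of that validation; the function name and error message are illustrative, not taken from the commit.

```python
# Minimal sketch (assumed behavior, not the exact torchtitan code): the config's
# tokenizer_path must now be a directory produced by download_tokenizer.py, and the
# old single-file tokenizer.model path is rejected explicitly.
import os


def validate_tokenizer_path(tokenizer_path: str) -> str:
    if tokenizer_path.endswith("tokenizer.model"):
        raise ValueError(
            "tokenizer_path now points to a directory; re-download the tokenizer "
            "files with `python scripts/download_tokenizer.py ...` and pass that "
            "directory instead of tokenizer.model."
        )
    if not os.path.isdir(tokenizer_path):
        raise FileNotFoundError(f"tokenizer directory not found: {tokenizer_path}")
    return tokenizer_path
```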

.ci/docker/requirements-dev.txt

Lines changed: 1 addition & 0 deletions
@@ -3,3 +3,4 @@ pytest==7.3.2
 pytest-cov
 pre-commit
 tomli-w >= 1.1.0
+transformers

.ci/docker/requirements.txt

Lines changed: 0 additions & 2 deletions
@@ -2,8 +2,6 @@ torchdata >= 0.8.0
 datasets >= 3.6.0
 tomli >= 1.1.0 ; python_version < "3.11"
 tensorboard
-tiktoken
-blobfile
 tabulate
 wandb
 fsspec

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ We actively welcome your pull requests.
 2. If you've added code that should be tested, add tests.
 3. If you've changed APIs, update the documentation.
 4. Ensure the test suite passes.
-5. Make sure your code lints (`pre-commit run --all-files`).
+5. Make sure your code lints (`pre-commit run --files $(git diff --name-only HEAD~1)`).
 6. If you haven't already, complete the Contributor License Agreement ("CLA").

 ### Contributor License Agreement ("CLA")

pyproject.toml

Lines changed: 1 addition & 2 deletions
@@ -17,8 +17,7 @@ dependencies = [
     "datasets>=2.21.0",

     # Tokenization
-    "blobfile",
-    "tiktoken",
+    "tokenizers",

     # Miscellaneous
     "tomli>=1.1.0",

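With `tiktoken` and `blobfile` dropped in favor of the Hugging Face `tokenizers` package, the refactored tokenizer presumably loads the downloaded `tokenizer.json` directly. A rough sketch of such a wrapper, exposing the `add_bos`/`add_eos` flags seen in the `test_generate.py` change below; the class name, special-token strings, and internals are assumptions, not code from the commit:

```python
# Illustrative tokenizers-backed wrapper; the real BaseTokenizer in this commit may
# differ in structure and special-token handling.
import os

from tokenizers import Tokenizer  # the new dependency replacing tiktoken


class HFTokenizerWrapper:
    def __init__(self, tokenizer_dir: str):
        # Load the tokenizer.json downloaded by scripts/download_tokenizer.py.
        self.tokenizer = Tokenizer.from_file(
            os.path.join(tokenizer_dir, "tokenizer.json")
        )
        # Special-token names are model-specific; these are placeholders.
        self.bos_id = self.tokenizer.token_to_id("<|begin_of_text|>")
        self.eos_id = self.tokenizer.token_to_id("<|end_of_text|>")

    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False) -> list[int]:
        ids = self.tokenizer.encode(text, add_special_tokens=False).ids
        if add_bos and self.bos_id is not None:
            ids = [self.bos_id, *ids]
        if add_eos and self.eos_id is not None:
            ids = [*ids, self.eos_id]
        return ids
```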
scripts/download_tokenizer.py

Lines changed: 2 additions & 2 deletions
@@ -108,7 +108,7 @@ def is_tokenizer_file(filename: str) -> bool:
                 print(f"Successfully downloaded {filename} to {file_path}")
                 downloaded_files.append(filename)
             except HTTPError as e:
-                if e.response.status_code == 404:
+                if e.response and e.response.status_code == 404:
                     print(f"File {filename} not found, skipping...")
                     continue
                 else:
@@ -122,7 +122,7 @@ def is_tokenizer_file(filename: str) -> bool:
             print(f"Warning: No tokenizer files could be downloaded from {repo_id}")

     except HTTPError as e:
-        if e.response.status_code == 401:
+        if e.response and e.response.status_code == 401:
             print(
                 "You need to pass a valid `--hf_token=...` to download private checkpoints."
             )
scripts/generate/test_generate.py

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ def test_generate(
     input_ids = (
         (
             torch.tensor(
-                tokenizer.encode(prompt, bos=True, eos=False), dtype=torch.long
+                tokenizer.encode(prompt, add_bos=True, add_eos=False), dtype=torch.long
             )
             .view(1, -1)
             .repeat(batch_size, 1)