File tree Expand file tree Collapse file tree 3 files changed +10
-12
lines changed Expand file tree Collapse file tree 3 files changed +10
-12
lines changed Original file line number Diff line number Diff line change 19
19
parametrize ,
20
20
)
21
21
22
- from torchtitan .components .tokenizer import build_hf_tokenizer
22
+ from torchtitan .components .tokenizer import HuggingFaceTokenizer
23
23
24
24
25
25
class TestTokenizerIntegration (unittest .TestCase ):
@@ -278,7 +278,7 @@ def test_download_and_build_tokenizer(self, test_repo_id):
278
278
model_name = test_repo_id .split ("/" )[- 1 ]
279
279
tokenizer_dir = "tokenizer" if model_name == "FLUX.1-dev" else "."
280
280
tokenizer_path = os .path .join (self .temp_dir , model_name , tokenizer_dir )
281
- our_tokenizer = build_hf_tokenizer (tokenizer_path )
281
+ our_tokenizer = HuggingFaceTokenizer (tokenizer_path )
282
282
283
283
# Step 3: Load tokenizer using official Tokenizer library (if available)
284
284
official_tokenizer = None
Original file line number Diff line number Diff line change 8
8
import json
9
9
import os
10
10
from abc import ABC , abstractmethod
11
- from typing import Any , Optional
11
+ from typing import Any , Optional , Union
12
12
13
13
from tokenizers import AddedToken , Tokenizer as HfTokenizer
14
-
14
+ from torchtitan . config_manager import JobConfig
15
15
from typing_extensions import override
16
16
17
17
@@ -407,20 +407,18 @@ def id_to_token(self, token_id: int) -> Optional[str]:
407
407
return self .tokenizer .id_to_token (token_id )
408
408
409
409
410
- def build_hf_tokenizer (tokenizer_path : str ) -> HuggingFaceTokenizer :
410
+ def build_hf_tokenizer (
411
+ job_config : JobConfig ,
412
+ ) -> Union [HuggingFaceTokenizer , Tokenizer ]:
411
413
"""
412
414
Builds a HuggingFaceTokenizer from the tokenizer path specified in the job config.
413
-
414
415
This function creates a HuggingFaceTokenizer instance that handles BOS/EOS token
415
416
inference and intelligent encoding. The tokenizer automatically detects and loads
416
417
from various file formats and infers special token behavior.
417
-
418
418
Args:
419
- tokenizer_path (str): Path to the directory containing tokenizer files.
420
- Should contain one or more of the supported file types.
421
-
419
+ job_config (JobConfig): Job configuration whose `model.tokenizer_path` points to the tokenizer directory.
422
420
Returns:
423
421
tokenizer (HuggingFaceTokenizer): Loaded tokenizer instance with intelligent BOS/EOS handling.
424
422
"""
425
- tokenizer = HuggingFaceTokenizer (tokenizer_path )
423
+ tokenizer = HuggingFaceTokenizer (job_config . model . tokenizer_path )
426
424
return tokenizer
Original file line number Diff line number Diff line change @@ -126,7 +126,7 @@ def __init__(self, job_config: JobConfig):
126
126
127
127
# build dataloader
128
128
tokenizer = (
129
- self .train_spec .build_tokenizer_fn (job_config . model . tokenizer_path )
129
+ self .train_spec .build_tokenizer_fn (job_config )
130
130
if self .train_spec .build_tokenizer_fn is not None
131
131
else None
132
132
)
You can’t perform that action at this time.
0 commit comments