Merged
1 change: 0 additions & 1 deletion recipes/configs/mistral/7B_full_ppo_low_memory.yaml

@@ -34,7 +34,6 @@ tokenizer:
 dataset:
   _component_: torchtune.datasets.text_completion_dataset
   source: trl-internal-testing/sentiment-trl-style
-  max_seq_len: null
   split: train
   column: prompt
   add_eos: False
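The dataset-level `max_seq_len` is deleted from this config because the truncation limit now lives on the tokenizer. A minimal sketch of the equivalent programmatic setup, assuming the post-change keyword names; the tokenizer path is a placeholder, and `llama2_tokenizer` merely stands in for whatever tokenizer the recipe actually instantiates:

from torchtune.datasets import text_completion_dataset
from torchtune.models.llama2 import llama2_tokenizer

# The length limit moves from the dataset section to the tokenizer; this mirrors
# the "tokenizer.max_seq_len=64" override added in the test change below.
tokenizer = llama2_tokenizer(path="/tmp/tokenizer.model", max_seq_len=4096)

ds = text_completion_dataset(
    tokenizer=tokenizer,
    source="trl-internal-testing/sentiment-trl-style",
    column="prompt",
    add_eos=False,
    split="train",
)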
@@ -43,6 +43,7 @@ def _get_test_config_overrides(self):
             "tokenizer.path=/tmp/test-artifacts/tokenizer.model",
             "tokenizer._component_=torchtune.models.llama2.llama2_tokenizer",
             "tokenizer.prompt_template=null",
+            "tokenizer.max_seq_len=64",
             "seed=9",
             "optimizer=torch.optim.AdamW",
             "optimizer.lr=2e-5",
1 change: 0 additions & 1 deletion tests/recipes/utils.py

@@ -72,7 +72,6 @@ def dummy_text_completion_alpaca_dataset_config():
         f"dataset.data_files={data_files}",
         "dataset.column='instruction'",
         "dataset.split='train[:10%]'",  # 10% of the dataset gets us 8 batches
-        "dataset.max_seq_len=64",
         "dataset.add_eos=False",
     ]
     return out
42 changes: 19 additions & 23 deletions torchtune/datasets/_text_completion.py

@@ -29,9 +29,6 @@ class TextCompletionDataset(Dataset):
             for Hugging Face datasets or tabular data. For local datasets with a single column
             (e.g. unstructured txt files), use the default "text" which is used by Hugging Face datasets
             when loaded into memory. Default is "text".
-        max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists.
-            Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory
-            and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length.
         add_eos (bool): Whether to add an EOS token to the end of the sequence. Default is True.
         filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
             the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
@@ -45,14 +42,12 @@ def __init__(
         tokenizer: ModelTokenizer,
         source: str,
         column: str = "text",
-        max_seq_len: Optional[int] = None,
         add_eos: bool = True,
         filter_fn: Optional[Callable] = None,
         **load_dataset_kwargs: Dict[str, Any],
     ) -> None:
         self._tokenizer = tokenizer
         self._data = load_dataset(source, **load_dataset_kwargs)
-        self.max_seq_len = max_seq_len
         self._column = column
         self.add_eos = add_eos

@@ -72,8 +67,8 @@ def _prepare_sample(self, sample: Mapping[str, Any]) -> Dict[str, List[int]]:
         tokens = self._tokenizer.encode(text=prompt, add_bos=True, add_eos=self.add_eos)

         # Truncate if needed, but don't coerce EOS id
-        if self.max_seq_len is not None:
-            tokens = truncate(tokens, self.max_seq_len - 1)
+        if self._tokenizer.max_seq_len is not None:
+            tokens = truncate(tokens, self._tokenizer.max_seq_len - 1)

         # No need to offset labels by 1 - happens in the recipe
         labels = tokens.copy()
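With this change `TextCompletionDataset` no longer tracks its own `max_seq_len`; truncation is driven entirely by the tokenizer. A minimal sketch of the new truncation behavior, using an illustrative stub in place of a real `ModelTokenizer` and a simplified stand-in for torchtune's `truncate` helper (both are assumptions for illustration, not the library's implementations):

from typing import List, Optional


def truncate(tokens: List[int], max_len: int) -> List[int]:
    # Simplified stand-in for torchtune's truncate helper: keep the first max_len ids.
    return tokens[:max_len]


class StubTokenizer:
    # Illustrative stub: only the attribute the dataset now reads.
    def __init__(self, max_seq_len: Optional[int] = None) -> None:
        self.max_seq_len = max_seq_len


def prepare_tokens(tokens: List[int], tokenizer: StubTokenizer) -> List[int]:
    # Mirrors the patched _prepare_sample: truncate to max_seq_len - 1 so a
    # truncated sequence never ends on a coerced EOS id.
    if tokenizer.max_seq_len is not None:
        tokens = truncate(tokens, tokenizer.max_seq_len - 1)
    return tokens


print(prepare_tokens(list(range(10)), StubTokenizer(max_seq_len=4)))  # [0, 1, 2]
print(prepare_tokens(list(range(10)), StubTokenizer()))  # unchanged: no max_seq_len set

Keeping the limit on the tokenizer means every dataset built on the same tokenizer truncates consistently, which is what lets the configs above drop their dataset-level keys.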
@@ -85,10 +80,10 @@ def text_completion_dataset(
     tokenizer: ModelTokenizer,
     source: str,
     column: str = "text",
-    max_seq_len: Optional[int] = None,
     add_eos: bool = True,
     packed: bool = False,
     split_across_pack: bool = True,
+    split: str = "train",
     filter_fn: Optional[Callable] = None,
     **load_dataset_kwargs: Dict[str, Any],
 ) -> Union[TextCompletionDataset, PackedDataset]:
@@ -109,16 +104,15 @@
             for Hugging Face datasets or tabular data. For local datasets with a single column
             (e.g. unstructured txt files), use the default "text" which is used by Hugging Face datasets
             when loaded into memory. Default is "text".
-        max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists.
-            Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory
-            and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length.
         add_eos (bool): Whether to add an EOS token to the end of the sequence. Default is True.
         packed (bool): Whether or not to pack the dataset to ``max_seq_len`` prior to training. Default is False.
         split_across_pack (bool): if the last sample in a pack does not fit in ``max_seq_len``,
             split the sample into the next pack, or move it entirely to the beginning of the next pack.
             For pre-training, typically this is set to True for general text completion. For
             fine-tuning, typically this is set to False to avoid truncating sentences in instruct
             tuning. This argument is ignored if ``packed=False``. Default is True.
+        split (str): ``split`` argument for ``datasets.load_dataset``. You can use this argument to load a subset
+            of a given split, e.g. ``split="train[:10%]"``. Default is "train".
         filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
             the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
             details.
@@ -130,9 +124,9 @@
         ...     tokenizer=tokenizer,
         ...     source="allenai/c4",
         ...     column="text",
-        ...     max_seq_len=2096,
         ...     data_dir="realnewslike",
         ...     packed=False,
+        ...     split="train",
         ... )

     This can also be accomplished via the yaml config::
@@ -141,30 +135,32 @@
           _component_: torchtune.datasets.text_completion_dataset
           source: allenai/c4
           column: text
-          max_seq_len: 2096
           data_dir: realnewslike
           packed: False
+          split: train

     Returns:
         Union[TextCompletionDataset, PackedDataset]: the configured :class:`~torchtune.datasets.TextCompletionDataset`
         or :class:`~torchtune.datasets.PackedDataset` if ``packed=True``

+    Raises:
+        ValueError: If ``packed=True`` and ``tokenizer.max_seq_len`` is not set.
     """
     ds = TextCompletionDataset(
         tokenizer=tokenizer,
         source=source,
         column=column,
-        max_seq_len=max_seq_len,
         add_eos=add_eos,
+        split=split,
         filter_fn=filter_fn,
         **load_dataset_kwargs,
     )
-    return (
-        PackedDataset(
-            ds,
-            max_seq_len=max_seq_len,
-            padding_idx=tokenizer.pad_id,
-            split_across_pack=split_across_pack,
-        )
-        if packed
-        else ds
-    )
+    if packed:
+        if tokenizer.max_seq_len is None:
+            raise ValueError(
+                "PackedDataset requires a max_seq_len to be set on the tokenizer."
+            )
+        return PackedDataset(
+            ds, max_seq_len=tokenizer.max_seq_len, split_across_pack=split_across_pack
+        )
+    return ds
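The rewritten control flow also makes packing fail fast: `PackedDataset` now reads its length budget from `tokenizer.max_seq_len` instead of a dataset argument (and `padding_idx=tokenizer.pad_id` is no longer forwarded here). A hedged usage sketch of the post-change behavior; the tokenizer path is a placeholder and the C4 arguments are taken from the docstring example above:

from torchtune.datasets import text_completion_dataset
from torchtune.models.llama2 import llama2_tokenizer

# packed=True now checks the tokenizer up front and raises a clear ValueError
# when the length limit is unset.
bare_tokenizer = llama2_tokenizer(path="/tmp/tokenizer.model")
try:
    text_completion_dataset(
        tokenizer=bare_tokenizer,
        source="allenai/c4",
        column="text",
        data_dir="realnewslike",
        packed=True,
        split="train",
    )
except ValueError as err:
    print(err)  # PackedDataset requires a max_seq_len to be set on the tokenizer.

Setting `tokenizer.max_seq_len` (as in the first sketch above) makes the same call return a `PackedDataset` packed to that length.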