Merged
1 change: 0 additions & 1 deletion recipes/configs/mistral/7B_full_ppo_low_memory.yaml

@@ -34,7 +34,6 @@ tokenizer:
 dataset:
   _component_: torchtune.datasets.text_completion_dataset
   source: trl-internal-testing/sentiment-trl-style
-  max_seq_len: null
   split: train
   column: prompt
   add_eos: False
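The dataset-level `max_seq_len` is deleted from this config because the truncation limit now lives on the tokenizer. A minimal sketch of the equivalent programmatic setup, assuming the post-change keyword names; the tokenizer path is a placeholder, and `llama2_tokenizer` merely stands in for whatever tokenizer the recipe actually instantiates:

from torchtune.datasets import text_completion_dataset
from torchtune.models.llama2 import llama2_tokenizer

# The length limit moves from the dataset section to the tokenizer; this mirrors
# the "tokenizer.max_seq_len=64" override added in the test change below.
tokenizer = llama2_tokenizer(path="/tmp/tokenizer.model", max_seq_len=4096)

ds = text_completion_dataset(
    tokenizer=tokenizer,
    source="trl-internal-testing/sentiment-trl-style",
    column="prompt",
    add_eos=False,
    split="train",
)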
@@ -43,6 +43,7 @@ def _get_test_config_overrides(self):
             "tokenizer.path=/tmp/test-artifacts/tokenizer.model",
             "tokenizer._component_=torchtune.models.llama2.llama2_tokenizer",
             "tokenizer.prompt_template=null",
+            "tokenizer.max_seq_len=64",
             "seed=9",
             "optimizer=torch.optim.AdamW",
             "optimizer.lr=2e-5",
1 change: 0 additions & 1 deletion tests/recipes/utils.py

@@ -72,7 +72,6 @@ def dummy_text_completion_alpaca_dataset_config():
         f"dataset.data_files={data_files}",
         "dataset.column='instruction'",
         "dataset.split='train[:10%]'",  # 10% of the dataset gets us 8 batches
-        "dataset.max_seq_len=64",
         "dataset.add_eos=False",
     ]
     return out
42 changes: 19 additions & 23 deletions torchtune/datasets/_text_completion.py

@@ -29,9 +29,6 @@ class TextCompletionDataset(Dataset):
             for Hugging Face datasets or tabular data. For local datasets with a single column
             (e.g. unstructured txt files), use the default "text" which is used by Hugging Face datasets
             when loaded into memory. Default is "text".
-        max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists.
-            Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory
-            and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length.
         add_eos (bool): Whether to add an EOS token to the end of the sequence. Default is True.
         filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
             the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
@@ -45,14 +42,12 @@ def __init__(
         tokenizer: ModelTokenizer,
         source: str,
         column: str = "text",
-        max_seq_len: Optional[int] = None,
         add_eos: bool = True,
         filter_fn: Optional[Callable] = None,
         **load_dataset_kwargs: Dict[str, Any],
     ) -> None:
         self._tokenizer = tokenizer
         self._data = load_dataset(source, **load_dataset_kwargs)
-        self.max_seq_len = max_seq_len
         self._column = column
         self.add_eos = add_eos

@@ -72,8 +67,8 @@ def _prepare_sample(self, sample: Mapping[str, Any]) -> Dict[str, List[int]]:
         tokens = self._tokenizer.encode(text=prompt, add_bos=True, add_eos=self.add_eos)

         # Truncate if needed, but don't coerce EOS id
-        if self.max_seq_len is not None:
-            tokens = truncate(tokens, self.max_seq_len - 1)
+        if self._tokenizer.max_seq_len is not None:
+            tokens = truncate(tokens, self._tokenizer.max_seq_len - 1)

         # No need to offset labels by 1 - happens in the recipe
         labels = tokens.copy()
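With this change `TextCompletionDataset` no longer tracks its own `max_seq_len`; truncation is driven entirely by the tokenizer. A minimal sketch of the new truncation behavior, using an illustrative stub in place of a real `ModelTokenizer` and a simplified stand-in for torchtune's `truncate` helper (both are assumptions for illustration, not the library's implementations):

from typing import List, Optional


def truncate(tokens: List[int], max_len: int) -> List[int]:
    # Simplified stand-in for torchtune's truncate helper: keep the first max_len ids.
    return tokens[:max_len]


class StubTokenizer:
    # Illustrative stub: only the attribute the dataset now reads.
    def __init__(self, max_seq_len: Optional[int] = None) -> None:
        self.max_seq_len = max_seq_len


def prepare_tokens(tokens: List[int], tokenizer: StubTokenizer) -> List[int]:
    # Mirrors the patched _prepare_sample: truncate to max_seq_len - 1 so a
    # truncated sequence never ends on a coerced EOS id.
    if tokenizer.max_seq_len is not None:
        tokens = truncate(tokens, tokenizer.max_seq_len - 1)
    return tokens


print(prepare_tokens(list(range(10)), StubTokenizer(max_seq_len=4)))  # [0, 1, 2]
print(prepare_tokens(list(range(10)), StubTokenizer()))  # unchanged: no max_seq_len set

Keeping the limit on the tokenizer means every dataset built on the same tokenizer truncates consistently, which is what lets the configs above drop their dataset-level keys.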
@@ -85,10 +80,10 @@ def text_completion_dataset(
     tokenizer: ModelTokenizer,
     source: str,
     column: str = "text",
-    max_seq_len: Optional[int] = None,
     add_eos: bool = True,
     packed: bool = False,
     split_across_pack: bool = True,
+    split: str = "train",
     filter_fn: Optional[Callable] = None,
     **load_dataset_kwargs: Dict[str, Any],
 ) -> Union[TextCompletionDataset, PackedDataset]:
@@ -109,16 +104,15 @@
             for Hugging Face datasets or tabular data. For local datasets with a single column
             (e.g. unstructured txt files), use the default "text" which is used by Hugging Face datasets
             when loaded into memory. Default is "text".
-        max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists.
-            Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory
-            and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length.
         add_eos (bool): Whether to add an EOS token to the end of the sequence. Default is True.
         packed (bool): Whether or not to pack the dataset to ``max_seq_len`` prior to training. Default is False.
         split_across_pack (bool): if the last sample in a pack does not fit in ``max_seq_len``,
             split the sample into the next pack, or move it entirely to the beginning of the next pack.
             For pre-training, typically this is set to True for general text completion. For
             fine-tuning, typically this is set to False to avoid truncating sentences in instruct
             tuning. This argument is ignored if ``packed=False``. Default is True.
+        split (str): ``split`` argument for ``datasets.load_dataset``. You can use this argument to load a subset
+            of a given split, e.g. ``split="train[:10%]"``. Default is "train".
         filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
             the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
             details.
@@ -130,9 +124,9 @@
         ...     tokenizer=tokenizer,
         ...     source="allenai/c4",
         ...     column="text",
-        ...     max_seq_len=2096,
         ...     data_dir="realnewslike",
         ...     packed=False,
+        ...     split="train",
         ... )

     This can also be accomplished via the yaml config::
@@ -141,30 +135,32 @@
           _component_: torchtune.datasets.text_completion_dataset
           source: allenai/c4
           column: text
-          max_seq_len: 2096
           data_dir: realnewslike
           packed: False
+          split: train

     Returns:
         Union[TextCompletionDataset, PackedDataset]: the configured :class:`~torchtune.datasets.TextCompletionDataset`
         or :class:`~torchtune.datasets.PackedDataset` if ``packed=True``

+    Raises:
+        ValueError: If ``packed=True`` and ``tokenizer.max_seq_len`` is not set.
     """
     ds = TextCompletionDataset(
         tokenizer=tokenizer,
         source=source,
         column=column,
-        max_seq_len=max_seq_len,
         add_eos=add_eos,
+        split=split,
         filter_fn=filter_fn,
         **load_dataset_kwargs,
     )
-    return (
-        PackedDataset(
-            ds,
-            max_seq_len=max_seq_len,
-            padding_idx=tokenizer.pad_id,
-            split_across_pack=split_across_pack,
-        )
-        if packed
-        else ds
-    )
+    if packed:
+        if tokenizer.max_seq_len is None:
+            raise ValueError(
+                "PackedDataset requires a max_seq_len to be set on the tokenizer."
+            )
+        return PackedDataset(
+            ds, max_seq_len=tokenizer.max_seq_len, split_across_pack=split_across_pack
+        )
+    return ds
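The rewritten control flow also makes packing fail fast: `PackedDataset` now reads its length budget from `tokenizer.max_seq_len` instead of a dataset argument (and `padding_idx=tokenizer.pad_id` is no longer forwarded here). A hedged usage sketch of the post-change behavior; the tokenizer path is a placeholder and the C4 arguments are taken from the docstring example above:

from torchtune.datasets import text_completion_dataset
from torchtune.models.llama2 import llama2_tokenizer

# packed=True now checks the tokenizer up front and raises a clear ValueError
# when the length limit is unset.
bare_tokenizer = llama2_tokenizer(path="/tmp/tokenizer.model")
try:
    text_completion_dataset(
        tokenizer=bare_tokenizer,
        source="allenai/c4",
        column="text",
        data_dir="realnewslike",
        packed=True,
        split="train",
    )
except ValueError as err:
    print(err)  # PackedDataset requires a max_seq_len to be set on the tokenizer.

Setting `tokenizer.max_seq_len` (as in the first sketch above) makes the same call return a `PackedDataset` packed to that length.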