Skip to content

Commit f53590c

Browse files
authored
remove remote code data option, ptb (#1632)
## Purpose ## * Fix CI tests and drop support for datasets not supported by `datasets` ## Background ## As of [datasets==4.0.0](https://github.com/huggingface/datasets/releases/tag/4.0.0), datasets with remote code are no longer supported. This includes datasets such as Penn Treebank ## Changes ## * Remove `trust_remote_code_data` option (no longer supported by `datasets`) * Remove PTB dataset (this still has remote code, and can no longer be supported) Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 810d66d commit f53590c

File tree

6 files changed

+0
-49
lines changed

6 files changed

+0
-49
lines changed

src/llmcompressor/args/dataset_arguments.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -162,15 +162,6 @@ class DatasetArguments(CustomDatasetArguments):
162162
),
163163
},
164164
)
165-
trust_remote_code_data: bool = field(
166-
default=False,
167-
metadata={
168-
"help": "Whether or not to allow for datasets defined on the Hub using "
169-
"a dataset script. This option should only be set to True for "
170-
"repositories you trust and in which you have read the code, as it "
171-
"will execute code present on the Hub on your local machine."
172-
},
173-
)
174165
# --- pipeline arguments --- #
175166
pipeline: Optional[str] = field(
176167
default="independent",

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,6 @@ def oneshot(
227227
overwrite_cache: bool = False,
228228
preprocessing_num_workers: Optional[int] = None,
229229
min_tokens_per_module: Optional[float] = None,
230-
trust_remote_code_data: bool = False,
231230
# Miscellaneous arguments
232231
output_dir: Optional[str] = None,
233232
log_dir: Optional[str] = "sparse_logs",
@@ -289,8 +288,6 @@ def oneshot(
289288
preprocessing.
290289
:param min_tokens_per_module: Minimum percentage of tokens per
291290
module, relevant for MoE models.
292-
:param trust_remote_code_data: Whether to allow for datasets defined on the Hub
293-
using a dataset script.
294291
295292
# Miscellaneous arguments
296293
:param output_dir: Path to save the output model after calibration.

src/llmcompressor/transformers/finetune/data/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,5 @@
99
from .gsm8k import GSM8KDataset
1010
from .open_platypus import OpenPlatypusDataset
1111
from .peoples_speech import PeoplesSpeech
12-
from .ptb import PtbDataset
1312
from .ultrachat_200k import UltraChatDataset
1413
from .wikitext import WikiTextDataset

src/llmcompressor/transformers/finetune/data/data_helpers.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ def get_raw_dataset(
3232
dataset_args.dataset_config_name,
3333
cache_dir=cache_dir,
3434
streaming=streaming,
35-
trust_remote_code=dataset_args.trust_remote_code_data,
3635
**kwargs,
3736
)
3837
return raw_datasets

src/llmcompressor/transformers/finetune/data/ptb.py

Lines changed: 0 additions & 32 deletions
This file was deleted.

tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,6 @@ def prepare_fixture(self, tiny_llama_tokenizer):
154154

155155
@parameterized.expand(
156156
[
157-
["ptb", "penn_treebank", "train[:5%]", False],
158157
["gsm8k", "main", "train[:5%]", True],
159158
["ultrachat_200k", "default", "train_sft[:1%]", False],
160159
]
@@ -164,7 +163,6 @@ def test_datasets(self, dataset_key, dataset_config, split, do_concat):
164163
dataset=dataset_key,
165164
dataset_config_name=dataset_config,
166165
concatenate_data=do_concat,
167-
trust_remote_code_data=True,
168166
)
169167
manager = TextGenerationDataset.load_from_registry(
170168
dataset_args.dataset,
@@ -270,7 +268,6 @@ def test_split_loading(self, split_def):
270268
dataset_args = DatasetArguments(
271269
dataset="open_platypus",
272270
splits=split_def,
273-
trust_remote_code_data=True,
274271
)
275272

276273
dataset = get_processed_dataset(

0 commit comments

Comments (0)