Skip to content

Commit f53590c

Browse files
authored
remove remote code data option, ptb (#1632)
## Purpose ## * Fix CI tests and drop support for datasets not supported by `datasets` ## Background ## As of [datasets==4.0.0](https://github.com/huggingface/datasets/releases/tag/4.0.0), datasets with remote code are no longer supported. This includes datasets such as Penn Treebank ## Changes ## * Remove `trust_remote_code_data` option (no longer supported by `datasets`) * Remove PTB dataset (this still has remote code, and can no longer be supported) Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 810d66d commit f53590c

File tree

6 files changed

+0
-49
lines changed

6 files changed

+0
-49
lines changed

src/llmcompressor/args/dataset_arguments.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -162,15 +162,6 @@ class DatasetArguments(CustomDatasetArguments):
162162
),
163163
},
164164
)
165-
trust_remote_code_data: bool = field(
166-
default=False,
167-
metadata={
168-
"help": "Whether or not to allow for datasets defined on the Hub using "
169-
"a dataset script. This option should only be set to True for "
170-
"repositories you trust and in which you have read the code, as it "
171-
"will execute code present on the Hub on your local machine."
172-
},
173-
)
174165
# --- pipeline arguments --- #
175166
pipeline: Optional[str] = field(
176167
default="independent",

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,6 @@ def oneshot(
227227
overwrite_cache: bool = False,
228228
preprocessing_num_workers: Optional[int] = None,
229229
min_tokens_per_module: Optional[float] = None,
230-
trust_remote_code_data: bool = False,
231230
# Miscellaneous arguments
232231
output_dir: Optional[str] = None,
233232
log_dir: Optional[str] = "sparse_logs",
@@ -289,8 +288,6 @@ def oneshot(
289288
preprocessing.
290289
:param min_tokens_per_module: Minimum percentage of tokens per
291290
module, relevant for MoE models.
292-
:param trust_remote_code_data: Whether to allow for datasets defined on the Hub
293-
using a dataset script.
294291
295292
# Miscellaneous arguments
296293
:param output_dir: Path to save the output model after calibration.

src/llmcompressor/transformers/finetune/data/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,5 @@
99
from .gsm8k import GSM8KDataset
1010
from .open_platypus import OpenPlatypusDataset
1111
from .peoples_speech import PeoplesSpeech
12-
from .ptb import PtbDataset
1312
from .ultrachat_200k import UltraChatDataset
1413
from .wikitext import WikiTextDataset

src/llmcompressor/transformers/finetune/data/data_helpers.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ def get_raw_dataset(
3232
dataset_args.dataset_config_name,
3333
cache_dir=cache_dir,
3434
streaming=streaming,
35-
trust_remote_code=dataset_args.trust_remote_code_data,
3635
**kwargs,
3736
)
3837
return raw_datasets

src/llmcompressor/transformers/finetune/data/ptb.py

Lines changed: 0 additions & 32 deletions
This file was deleted.

tests/llmcompressor/transformers/finetune/data/test_dataset_loading.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,6 @@ def prepare_fixture(self, tiny_llama_tokenizer):
154154

155155
@parameterized.expand(
156156
[
157-
["ptb", "penn_treebank", "train[:5%]", False],
158157
["gsm8k", "main", "train[:5%]", True],
159158
["ultrachat_200k", "default", "train_sft[:1%]", False],
160159
]
@@ -164,7 +163,6 @@ def test_datasets(self, dataset_key, dataset_config, split, do_concat):
164163
dataset=dataset_key,
165164
dataset_config_name=dataset_config,
166165
concatenate_data=do_concat,
167-
trust_remote_code_data=True,
168166
)
169167
manager = TextGenerationDataset.load_from_registry(
170168
dataset_args.dataset,
@@ -270,7 +268,6 @@ def test_split_loading(self, split_def):
270268
dataset_args = DatasetArguments(
271269
dataset="open_platypus",
272270
splits=split_def,
273-
trust_remote_code_data=True,
274271
)
275272

276273
dataset = get_processed_dataset(

0 commit comments

Comments (0)