@@ -74,34 +74,37 @@ def create_dataset(
74
74
seed : int = 42 ,
75
75
repeats : int = 0 ,
76
76
input_img_mode : str = 'RGB' ,
77
+ trust_remote_code : bool = False ,
77
78
** kwargs ,
78
79
):
79
80
""" Dataset factory method
80
81
81
82
In parentheses after each arg are the type of dataset supported for each arg, one of:
82
- * folder - default, timm folder (or tar) based ImageDataset
83
- * torch - torchvision based datasets
83
+ * Folder - default, timm folder (or tar) based ImageDataset
84
+ * Torch - torchvision based datasets
84
85
* HFDS - Hugging Face Datasets
86
+ * HFIDS - Hugging Face Datasets Iterable (streaming mode, with IterableDataset)
85
87
* TFDS - Tensorflow-datasets wrapper in IterableDataset interface via IterableImageDataset
86
88
* WDS - Webdataset
87
- * all - any of the above
89
+ * All - any of the above
88
90
89
91
Args:
90
- name: dataset name, empty is okay for folder based datasets
91
- root: root folder of dataset (all)
92
- split: dataset split (all)
93
- search_split: search for split specific child fold from root so one can specify
94
- `imagenet/` instead of `/imagenet/val`, etc on cmd line / config. (folder, torch/folder)
95
- class_map: specify class -> index mapping via text file or dict (folder)
96
- load_bytes: load data, return images as undecoded bytes (folder)
97
- download: download dataset if not present and supported (HFDS, TFDS, torch)
98
- is_training: create dataset in train mode, this is different from the split.
99
- For Iterable / TDFS it enables shuffle, ignored for other datasets. (TFDS, WDS)
100
- batch_size: batch size hint for (TFDS, WDS)
101
- seed: seed for iterable datasets (TFDS, WDS)
102
- repeats: dataset repeats per iteration i.e. epoch (TFDS, WDS)
103
- input_img_mode: Input image color conversion mode e.g. 'RGB', 'L' (folder, TFDS, WDS, HFDS)
104
- **kwargs: other args to pass to dataset
92
+ name: Dataset name, empty is okay for folder based datasets
93
+ root: Root folder of dataset (All)
94
+ split: Dataset split (All)
95
+ search_split: Search for split specific child folder from root so one can specify
96
+ `imagenet/` instead of `/imagenet/val`, etc on cmd line / config. (Folder, Torch)
97
+ class_map: Specify class -> index mapping via text file or dict (Folder)
98
+ load_bytes: Load data, return images as undecoded bytes (Folder)
99
+ download: Download dataset if not present and supported (HFIDS, TFDS, Torch)
100
+ is_training: Create dataset in train mode, this is different from the split.
101
+ For Iterable / TFDS it enables shuffle, ignored for other datasets. (TFDS, WDS, HFIDS)
102
+ batch_size: Batch size hint for iterable datasets (TFDS, WDS, HFIDS)
103
+ seed: Seed for iterable datasets (TFDS, WDS, HFIDS)
104
+ repeats: Dataset repeats per iteration i.e. epoch (TFDS, WDS, HFIDS)
105
+ input_img_mode: Input image color conversion mode e.g. 'RGB', 'L' (Folder, TFDS, WDS, HFDS, HFIDS)
106
+ trust_remote_code: Trust remote code in Hugging Face Datasets if True (HFDS, HFIDS)
107
+ **kwargs: Other args to pass through to underlying Dataset and/or Reader classes
105
108
106
109
Returns:
107
110
Dataset object
@@ -162,6 +165,7 @@ def create_dataset(
162
165
split = split ,
163
166
class_map = class_map ,
164
167
input_img_mode = input_img_mode ,
168
+ trust_remote_code = trust_remote_code ,
165
169
** kwargs ,
166
170
)
167
171
elif name .startswith ('hfids/' ):
@@ -177,7 +181,8 @@ def create_dataset(
177
181
repeats = repeats ,
178
182
seed = seed ,
179
183
input_img_mode = input_img_mode ,
180
- ** kwargs
184
+ trust_remote_code = trust_remote_code ,
185
+ ** kwargs ,
181
186
)
182
187
elif name .startswith ('tfds/' ):
183
188
ds = IterableImageDataset (
0 commit comments