Skip to content

Commit 6bffbfc

Browse files
authored
[Flux] Remove Flux dataloader error handling to avoid confusion (#1290)
As titled. We cannot achieve a robust dataloader because of huggingface/datasets#7612
1 parent cddd7dc commit 6bffbfc

File tree

1 file changed

+6
-7
lines changed

1 file changed

+6
-7
lines changed

torchtitan/experiments/flux/dataset/flux_dataset.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -205,9 +205,15 @@ def _get_data_iter(self):
205205
def __iter__(self):
206206
dataset_iterator = self._get_data_iter()
207207
while True:
208+
# TODO: Add support for robust data loading and error handling.
209+
# Currently, we assume the dataset is well-formed and does not contain corrupted samples.
210+
# If a corrupted sample is encountered, the program will crash and throw an exception.
211+
# You can NOT try to catch the exception and continue, because the iterator within the dataset
212+
# is not broken after raising an exception, so calling next() will throw StopIteration and might cause a re-loop.
208213
try:
209214
sample = next(dataset_iterator)
210215
except StopIteration:
216+
# We are assuming the program hits here only when reaching the end of the dataset.
211217
if not self.infinite:
212218
logger.warning(
213219
f"Dataset {self.dataset_name} has run out of data. \
@@ -220,13 +226,6 @@ def __iter__(self):
220226
logger.info(f"Dataset {self.dataset_name} is being re-looped.")
221227
dataset_iterator = self._get_data_iter()
222228
continue
223-
except (UnicodeDecodeError, SyntaxError, OSError) as e:
224-
# Handle other exception, eg, dataset corruption
225-
logger.warning(
226-
f"Dataset {self.dataset_name} has error while loading batch data. \
227-
Error {type(e).__name__}: {e}. The error could be the result of a streaming glitch."
228-
)
229-
continue
230229

231230
# Use the dataset-specific preprocessor
232231
sample_dict = self._data_processor(

0 commit comments

Comments
 (0)