Skip to content

Commit 6bffbfc

Browse files
authored
[Flux] Remove Flux dataloader error handling to avoid confusion (#1290)
As titled. We cannot achieve a robust dataloader because of huggingface/datasets#7612
1 parent cddd7dc commit 6bffbfc

File tree

1 file changed

+6
-7
lines changed

1 file changed

+6
-7
lines changed

torchtitan/experiments/flux/dataset/flux_dataset.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -205,9 +205,15 @@ def _get_data_iter(self):
205205
def __iter__(self):
206206
dataset_iterator = self._get_data_iter()
207207
while True:
208+
# TODO: Add support for robust data loading and error handling.
209+
# Currently, we assume the dataset is well-formed and does not contain corrupted samples.
210+
# If a corrupted sample is encountered, the program will crash and throw an exception.
211+
# You can NOT try to catch the exception and continue, because the iterator within the dataset
212+
# is not broken after raising an exception, so calling next() will throw StopIteration and might cause a re-loop.
208213
try:
209214
sample = next(dataset_iterator)
210215
except StopIteration:
216+
# We are assuming the program hits here only when reaching the end of the dataset.
211217
if not self.infinite:
212218
logger.warning(
213219
f"Dataset {self.dataset_name} has run out of data. \
@@ -220,13 +226,6 @@ def __iter__(self):
220226
logger.info(f"Dataset {self.dataset_name} is being re-looped.")
221227
dataset_iterator = self._get_data_iter()
222228
continue
223-
except (UnicodeDecodeError, SyntaxError, OSError) as e:
224-
# Handle other exception, eg, dataset corruption
225-
logger.warning(
226-
f"Dataset {self.dataset_name} has error while loading batch data. \
227-
Error {type(e).__name__}: {e}. The error could be the result of a streaming glitch."
228-
)
229-
continue
230229

231230
# Use the dataset-specific preprocessor
232231
sample_dict = self._data_processor(

0 commit comments

Comments
 (0)