
Commit c166c4f

[BugFix] Fix minari dataloading (#3054)
1 parent 32f7d72 commit c166c4f

4 files changed: +56 −20 lines

.github/unittest/linux_libs/scripts_openx/environment.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -21,5 +21,5 @@ dependencies:
   - hydra-core
   - tqdm
   - h5py
-  - datasets
+  - datasets<4.0.0
   - pillow
```
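The only change here is pinning `datasets` below 4.0.0, matching the error handling added in `openx.py` below ("Dataset scripts are no longer supported" in `datasets>=4.0.0`). As a rough sketch, user code could guard against an incompatible install up front; the helper name and the use of `packaging` are assumptions, not part of this commit:

```python
# Hypothetical guard (not part of this commit): fail early if the installed
# `datasets` package is newer than the pinned upper bound.
import importlib.metadata

from packaging.version import Version


def check_datasets_version(upper_bound: str = "4.0.0") -> None:
    installed = Version(importlib.metadata.version("datasets"))
    if installed >= Version(upper_bound):
        raise RuntimeError(
            f"datasets=={installed} is too new; install `datasets<{upper_bound}`."
        )
```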

test/test_libs.py

Lines changed: 39 additions & 15 deletions
```diff
@@ -3342,33 +3342,57 @@ def test_d4rl_iteration(self, task, split_trajs):
 _MINARI_DATASETS = []


-def _minari_selected_datasets():
-    if not _has_minari or not _has_gymnasium:
-        return
+def _minari_init():
+    """Initialize Minari datasets list. Returns True if already initialized."""
     global _MINARI_DATASETS
-    import minari
+    if _MINARI_DATASETS and not all(
+        isinstance(x, str) and x.isdigit() for x in _MINARI_DATASETS
+    ):
+        return True  # Already initialized with real dataset names

-    torch.manual_seed(0)
+    if not _has_minari or not _has_gymnasium:
+        return False

-    total_keys = sorted(
-        minari.list_remote_datasets(latest_version=True, compatible_minari_version=True)
-    )
-    indices = torch.randperm(len(total_keys))[:20]
-    keys = [total_keys[idx] for idx in indices]
+    try:
+        import minari
+
+        torch.manual_seed(0)

-    assert len(keys) > 5, keys
-    _MINARI_DATASETS += keys
+        total_keys = sorted(
+            minari.list_remote_datasets(
+                latest_version=True, compatible_minari_version=True
+            )
+        )
+        indices = torch.randperm(len(total_keys))[:20]
+        keys = [total_keys[idx] for idx in indices]

+        assert len(keys) > 5, keys
+        _MINARI_DATASETS[:] = keys  # Replace the placeholder values
+        return True
+    except Exception:
+        return False

-_minari_selected_datasets()
+
+# Initialize with placeholder values for parametrization
+# These will be replaced with actual dataset names when the first Minari test runs
+_MINARI_DATASETS = [str(i) for i in range(20)]


 @pytest.mark.skipif(not _has_minari or not _has_gymnasium, reason="Minari not found")
 @pytest.mark.slow
 class TestMinari:
     @pytest.mark.parametrize("split", [False, True])
-    @pytest.mark.parametrize("selected_dataset", _MINARI_DATASETS)
-    def test_load(self, selected_dataset, split):
+    @pytest.mark.parametrize("dataset_idx", range(20))
+    def test_load(self, dataset_idx, split):
+        # Initialize Minari datasets if not already done
+        if not _minari_init():
+            pytest.skip("Failed to initialize Minari datasets")
+
+        # Get the actual dataset name from the initialized list
+        if dataset_idx >= len(_MINARI_DATASETS):
+            pytest.skip(f"Dataset index {dataset_idx} out of range")
+
+        selected_dataset = _MINARI_DATASETS[dataset_idx]
         torchrl_logger.info(f"dataset {selected_dataset}")
         data = MinariExperienceReplay(
             selected_dataset, batch_size=32, split_trajs=split
```
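The test change works around the fact that `pytest.mark.parametrize` arguments are evaluated at collection time: listing remote Minari datasets at import time could fail before any test runs. The test now parametrizes over stable indices and resolves the real dataset names lazily on first use. A standalone sketch of the same pattern (dataset names and the resolver body are placeholders, not TorchRL code):

```python
# Standalone sketch of the lazy-parametrization pattern above; the dataset
# names and the resolver body are placeholders, not TorchRL code.
import pytest

_DATASETS = [str(i) for i in range(3)]  # placeholders used at collection time


def _resolve_datasets() -> bool:
    if _DATASETS and not all(x.isdigit() for x in _DATASETS):
        return True  # already resolved to real names
    try:
        # In the real test this would query minari.list_remote_datasets().
        _DATASETS[:] = ["cartpole-v1", "pendulum-v1", "mountaincar-v1"]
        return True
    except Exception:
        return False


@pytest.mark.parametrize("dataset_idx", range(3))
def test_dataset(dataset_idx):
    if not _resolve_datasets():
        pytest.skip("could not resolve dataset names")
    if dataset_idx >= len(_DATASETS):
        pytest.skip(f"dataset index {dataset_idx} out of range")
    name = _DATASETS[dataset_idx]
    assert not name.isdigit()  # the placeholder has been replaced
```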

torchrl/data/datasets/minari_data.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -463,5 +463,9 @@ def _patch_info(info_td):
     val_td_sel = val_td_sel.apply(
         lambda x: torch.cat([torch.zeros_like(x[:1]), x], 0), batch_size=[min_shape + 1]
     )
-    val_td_sel.update(val_td.select(*unique_shapes[max_shape]))
+    source = val_td.select(*unique_shapes[max_shape])
+    # make sure source has no batch size
+    source.batch_size = ()
+    if not source.is_empty():
+        val_td_sel.update(source, update_batch_size=True)
     return val_td_sel
```
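In `_patch_info`, entries with the shorter leading dimension are padded with a zero row before the longer-shaped entries are merged in, and the update now clears the source batch size and passes `update_batch_size=True` so mismatched batch sizes no longer fail. A torch-only illustration of the padding step (the shapes are assumptions chosen for the example, not taken from a real Minari dataset):

```python
# Torch-only illustration of the zero-row padding used above; shapes are
# assumptions for the example, not taken from a real Minari dataset.
import torch

min_shape = 4
short = torch.arange(min_shape, dtype=torch.float32).unsqueeze(-1)  # shape [4, 1]
padded = torch.cat([torch.zeros_like(short[:1]), short], 0)         # shape [5, 1]
assert padded.shape == (min_shape + 1, 1)
```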

torchrl/data/datasets/openx.py

Lines changed: 11 additions & 3 deletions
```diff
@@ -577,9 +577,17 @@ def _init(self):
         )
         import datasets

-        dataset = datasets.load_dataset(
-            self.repo, self.dataset_id, streaming=True, split=self.split
-        )
+        try:
+            dataset = datasets.load_dataset(
+                self.repo, self.dataset_id, streaming=True, split=self.split
+            )
+        except Exception as e:
+            if "Dataset scripts are no longer supported" in str(e):
+                raise RuntimeError(
+                    f"Failed to load dataset {self.dataset_id}. Your version of `datasets` is too new - please downgrade to <4.0.0."
+                ) from e
+            raise e
+
         if self.shuffle:
             dataset = dataset.shuffle()
         self.dataset = dataset
```
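The OpenX loader now wraps `datasets.load_dataset` so that the opaque "Dataset scripts are no longer supported" error raised by `datasets>=4.0.0` is re-raised with an actionable hint, while other failures propagate unchanged. A generic sketch of the same error-translation pattern (`load_fn`, `repo` and `dataset_id` are placeholder names, not the TorchRL API):

```python
# Generic sketch of the error-translation pattern above; the names are
# placeholders, not the TorchRL API.
def load_streaming_dataset(load_fn, repo, dataset_id, split=None):
    try:
        return load_fn(repo, dataset_id, streaming=True, split=split)
    except Exception as e:
        if "Dataset scripts are no longer supported" in str(e):
            raise RuntimeError(
                f"Failed to load dataset {dataset_id}. Your version of "
                "`datasets` is too new - please downgrade to <4.0.0."
            ) from e
        raise
```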
