
Commit ced3a8b

alexeykudinkin and zhaoch23 authored and committed
[Data] Fixing Optimizer to apply rules until plan stabilize; (ray-project#52663)
Fixing tests

## Why are these changes needed?

1. Fixing `Optimizer` to continue applying rules until plans stop changing
2. Fixing tests to avoid making assumptions about the number of blocks / data being sorted
3. Minor cleanups

Signed-off-by: Alexey Kudinkin <ak@anyscale.com>
Signed-off-by: zhaoch23 <c233zhao@uwaterloo.ca>
1 parent cd5275c · commit ced3a8b

File tree

9 files changed: +49 −52 lines changed

python/ray/data/_internal/execution/operators/map_transformer.py

Lines changed: 2 additions & 7 deletions

```diff
@@ -90,9 +90,7 @@ def output_block_size_option(self):
         return self._output_block_size_option
 
     def set_target_max_block_size(self, target_max_block_size: int):
-        assert (
-            self._output_block_size_option is None and target_max_block_size is not None
-        )
+        assert target_max_block_size is not None
         self._output_block_size_option = OutputBlockSizeOption(
             target_max_block_size=target_max_block_size
         )
@@ -105,10 +103,7 @@ def target_max_block_size(self):
         return self._output_block_size_option.target_max_block_size
 
     def set_target_num_rows_per_block(self, target_num_rows_per_block: int):
-        assert (
-            self._output_block_size_option is None
-            and target_num_rows_per_block is not None
-        )
+        assert target_num_rows_per_block is not None
         self._output_block_size_option = OutputBlockSizeOption(
             target_num_rows_per_block=target_num_rows_per_block
         )
```
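The assertions were relaxed to only check that the new value is non-None; previously they also required `self._output_block_size_option is None`, so calling a setter a second time would trip the assert. That matters once optimizer rules can be re-applied to the same plan (see the `Optimizer` change below). A minimal sketch of the behavioral difference, using a simplified stand-in class rather than the real `MapTransformer`:

```python
# Simplified stand-in for illustration only; not Ray Data's MapTransformer.
class BlockSizer:
    def __init__(self):
        self._output_block_size_option = None

    def set_target_max_block_size(self, target_max_block_size: int):
        # The old assert also required self._output_block_size_option to be
        # None, so a repeated call (e.g. from a re-applied rule) would fail.
        assert target_max_block_size is not None
        self._output_block_size_option = {
            "target_max_block_size": target_max_block_size
        }


sizer = BlockSizer()
sizer.set_target_max_block_size(128 * 1024 * 1024)
sizer.set_target_max_block_size(64 * 1024 * 1024)  # now allowed; asserted before
```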

python/ray/data/_internal/logical/interfaces/operator.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,4 +1,4 @@
-from typing import Iterator, List, Callable
+from typing import Callable, Iterator, List
 
 
 class Operator:
```

python/ray/data/_internal/logical/interfaces/optimizer.py

Lines changed: 10 additions & 2 deletions

```diff
@@ -34,6 +34,14 @@ def rules(self) -> List[Rule]:
 
     def optimize(self, plan: Plan) -> Plan:
         """Optimize operators with a list of rules."""
-        for rule in self.rules:
-            plan = rule.apply(plan)
+        # Apply rules until the plan is not changed
+        previous_plan = plan
+        while True:
+            for rule in self.rules:
+                plan = rule.apply(plan)
+            # TODO: Eventually we should implement proper equality.
+            # Using str to check equality seems brittle
+            if plan.dag.dag_str == previous_plan.dag.dag_str:
+                break
+            previous_plan = plan
         return plan
```
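The rewritten `optimize` is a fixed-point iteration: each pass applies every rule in order, and the loop exits only when a full pass leaves the plan unchanged (using the DAG's string form as an admittedly brittle equality proxy). A self-contained sketch of the same pattern, with toy stand-ins for `Rule` and `Plan` rather than Ray Data's actual interfaces:

```python
# Fixed-point rule application, sketched with simplified stand-in types.
from dataclasses import dataclass
from typing import Callable, List


@dataclass(frozen=True)
class Plan:
    dag_str: str  # string form used as a (brittle) equality proxy


def optimize(plan: Plan, rules: List[Callable[[Plan], Plan]]) -> Plan:
    previous = plan
    while True:
        for rule in rules:
            plan = rule(plan)
        # Stop once a full pass over all rules produced no change.
        if plan.dag_str == previous.dag_str:
            return plan
        previous = plan


# Toy rules: each rewrites the plan string once, then becomes a no-op.
fuse = lambda p: Plan(p.dag_str.replace("Map->Map", "FusedMap"))
prune = lambda p: Plan(p.dag_str.replace("Noop->", ""))

print(optimize(Plan("Read->Noop->Map->Map"), [fuse, prune]).dag_str)
# Read->FusedMap
```

Note how `prune` enables `fuse` only on the second pass in some orderings; a single sweep over the rules (the old behavior) could leave such opportunities on the table.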

python/ray/data/_internal/util.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -785,13 +785,14 @@ def find_partition_index(
         # is an index into the ascending order of ``col_vals``, so we need
         # to subtract it from ``len(col_vals)`` to get the index in the
         # original descending order of ``col_vals``.
+        sorter = np.arange(len(col_vals) - 1, -1, -1)
         left = prevleft + (
             len(col_vals)
             - np.searchsorted(
                 col_vals,
                 desired_val,
                 side="right",
-                sorter=np.arange(len(col_vals) - 1, -1, -1),
+                sorter=sorter,
             )
         )
         right = prevleft + (
@@ -800,7 +801,7 @@ def find_partition_index(
                 col_vals,
                 desired_val,
                 side="left",
-                sorter=np.arange(len(col_vals) - 1, -1, -1),
+                sorter=sorter,
             )
         )
     else:
```
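This change is a pure refactor: the reversed-index `sorter` array is now built once and shared by both `np.searchsorted` calls instead of being recomputed. For context, `np.searchsorted` requires ascending input, so a descending `col_vals` is searched through a `sorter` permutation; a small standalone example with made-up values:

```python
import numpy as np

# np.searchsorted assumes ascending order; for a descending array we pass
# ``sorter``, a permutation of indices that puts the array in ascending
# order. For strictly descending values that is the reversed index range.
col_vals = np.array([9, 7, 5, 3, 1])  # descending
sorter = np.arange(len(col_vals) - 1, -1, -1)  # [4, 3, 2, 1, 0]

# Insertion point for 5 in the *ascending* view of col_vals:
asc_idx = np.searchsorted(col_vals, 5, side="right", sorter=sorter)
# Convert back to an index into the original descending order:
desc_idx = len(col_vals) - asc_idx
print(asc_idx, desc_idx)  # 3 2  -> col_vals[2] == 5
```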

python/ray/data/read_api.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -108,7 +108,6 @@
 
 from ray.data._internal.datasource.tfrecords_datasource import TFXReadOptions
 
-
 T = TypeVar("T")
 
 logger = logging.getLogger(__name__)
```

python/ray/data/tests/test_context.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -1,5 +1,6 @@
 import pytest
 
+
 import ray
 
 
```

python/ray/data/tests/test_execution_optimizer.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -68,7 +68,7 @@ def _check_valid_plan_and_result(
     expected_physical_plan_ops=None,
 ):
     assert ds.take_all() == expected_result
-    assert str(ds._plan._logical_plan.dag) == expected_plan
+    assert ds._plan._logical_plan.dag.dag_str == expected_plan
 
     expected_physical_plan_ops = expected_physical_plan_ops or []
     for op in expected_physical_plan_ops:
```

python/ray/data/tests/test_json.py

Lines changed: 24 additions & 23 deletions

```diff
@@ -37,7 +37,7 @@ def test_json_read_partitioning(ray_start_regular_shared, tmp_path):
 
     ds = ray.data.read_json(path)
 
-    assert ds.take() == [
+    assert sorted(ds.take(), key=lambda row: row["number"]) == [
         {"number": 0, "string": "foo", "country": "us"},
         {"number": 1, "string": "bar", "country": "us"},
     ]
@@ -103,7 +103,7 @@ def test_json_read(ray_start_regular_shared, fs, data_path, endpoint_url):
     df2.to_json(path2, orient="records", lines=True, storage_options=storage_options)
     ds = ray.data.read_json(path, filesystem=fs)
     df = pd.concat([df1, df2], ignore_index=True)
-    dsdf = ds.to_pandas()
+    dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True)
     assert df.equals(dsdf)
     if fs is None:
         shutil.rmtree(path)
@@ -136,7 +136,7 @@ def test_json_read(ray_start_regular_shared, fs, data_path, endpoint_url):
     )
     ds = ray.data.read_json([path1, path2], filesystem=fs)
     df = pd.concat([df1, df2, df3], ignore_index=True)
-    dsdf = ds.to_pandas()
+    dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True)
     assert df.equals(dsdf)
     if fs is None:
         shutil.rmtree(path1)
@@ -159,7 +159,7 @@ def test_json_read(ray_start_regular_shared, fs, data_path, endpoint_url):
     df2.to_json(path2, orient="records", lines=True, storage_options=storage_options)
     ds = ray.data.read_json([dir_path, path2], filesystem=fs)
     df = pd.concat([df1, df2], ignore_index=True)
-    dsdf = ds.to_pandas()
+    dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True)
     assert df.equals(dsdf)
     if fs is None:
         shutil.rmtree(dir_path)
@@ -189,9 +189,8 @@ def test_json_read(ray_start_regular_shared, fs, data_path, endpoint_url):
     )
 
     ds = ray.data.read_json(path, filesystem=fs)
-    assert ds._plan.initial_num_blocks() == 2
     df = pd.concat([df1, df2], ignore_index=True)
-    dsdf = ds.to_pandas()
+    dsdf = ds.to_pandas().sort_values(by=["one", "two"]).reset_index(drop=True)
     assert df.equals(dsdf)
     if fs is None:
         shutil.rmtree(path)
@@ -410,7 +409,9 @@ def test_json_read_with_parse_options(
         (lazy_fixture("s3_fs"), lazy_fixture("s3_path"), lazy_fixture("s3_server")),
     ],
 )
+@pytest.mark.parametrize("style", [PartitionStyle.HIVE, PartitionStyle.DIRECTORY])
 def test_json_read_partitioned_with_filter(
+    style,
     ray_start_regular_shared,
     fs,
     data_path,
@@ -476,12 +477,13 @@ def skip_unpartitioned(kv_dict):
     ray.get(skipped_file_counter.reset.remote())
 
 
-def test_jsonl_mixed_types(ray_start_regular_shared, tmp_path):
+@pytest.mark.parametrize("override_num_blocks", [None, 1, 3])
+def test_jsonl_lists(ray_start_regular_shared, tmp_path, override_num_blocks):
     """Test JSONL with mixed types and schemas."""
     data = [
-        {"a": 1, "b": {"c": 2}},  # Nested dict
-        {"a": 1, "b": {"c": 3}},  # Nested dict
-        {"a": 1, "b": {"c": {"hello": "world"}}},  # Mixed Schema
+        ["ray", "rocks", "hello"],
+        ["oh", "no"],
+        ["rocking", "with", "ray"],
     ]
 
     path = os.path.join(tmp_path, "test.jsonl")
@@ -490,21 +492,20 @@ def test_jsonl_lists(ray_start_regular_shared, tmp_path, override_num_blocks):
         json.dump(record, f)
         f.write("\n")
 
-    ds = ray.data.read_json(path, lines=True)
+    ds = ray.data.read_json(path, lines=True, override_num_blocks=override_num_blocks)
     result = ds.take_all()
 
-    assert result[0] == data[0]  # Dict stays as is
-    assert result[1] == data[1]
-    assert result[2] == data[2]
+    assert result[0] == {"0": "ray", "1": "rocks", "2": "hello"}
+    assert result[1] == {"0": "oh", "1": "no", "2": None}
+    assert result[2] == {"0": "rocking", "1": "with", "2": "ray"}
 
 
-@pytest.mark.parametrize("override_num_blocks", [None, 1, 3])
-def test_jsonl_lists(ray_start_regular_shared, tmp_path, override_num_blocks):
+def test_jsonl_mixed_types(ray_start_regular_shared, tmp_path):
     """Test JSONL with mixed types and schemas."""
     data = [
-        ["ray", "rocks", "hello"],
-        ["oh", "no"],
-        ["rocking", "with", "ray"],
+        {"a": 1, "b": {"c": 2}},  # Nested dict
+        {"a": 1, "b": {"c": 3}},  # Nested dict
+        {"a": 1, "b": {"c": {"hello": "world"}}},  # Mixed Schema
     ]
 
     path = os.path.join(tmp_path, "test.jsonl")
@@ -513,12 +514,12 @@ def test_jsonl_lists(ray_start_regular_shared, tmp_path, override_num_blocks):
         json.dump(record, f)
         f.write("\n")
 
-    ds = ray.data.read_json(path, lines=True, override_num_blocks=override_num_blocks)
+    ds = ray.data.read_json(path, lines=True)
     result = ds.take_all()
 
-    assert result[0] == {"0": "ray", "1": "rocks", "2": "hello"}
-    assert result[1] == {"0": "oh", "1": "no", "2": None}
-    assert result[2] == {"0": "rocking", "1": "with", "2": "ray"}
+    assert result[0] == data[0]  # Dict stays as is
+    assert result[1] == data[1]
+    assert result[2] == data[2]
 
 
 @pytest.mark.parametrize(
```
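The `test_json.py` edits follow one pattern: because the optimizer fix means tests can no longer assume a particular block count or output order, assertions normalize row order before comparing. A minimal sketch of that normalization, with hypothetical toy data:

```python
import pandas as pd

# Hypothetical rows that may arrive in any block order.
expected = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
got = pd.DataFrame({"one": [3, 1, 2], "two": ["c", "a", "b"]})

# Normalize row order (and the index) before comparing.
normalized = got.sort_values(by=["one", "two"]).reset_index(drop=True)
assert expected.equals(normalized)
```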

python/ray/data/tests/test_webdataset.py

Lines changed: 7 additions & 15 deletions

```diff
@@ -39,7 +39,7 @@ def test_webdataset_read(ray_start_2_cpus, tmp_path):
             tf.write(f"{i}.b", str(i**2).encode("utf-8"))
     assert os.path.exists(path)
     assert len(glob.glob(f"{tmp_path}/*.tar")) == 1
-    ds = ray.data.read_webdataset(paths=[str(tmp_path)], override_num_blocks=1)
+    ds = ray.data.read_webdataset(paths=[str(tmp_path)])
     samples = ds.take(100)
     assert len(samples) == 100
     for i, sample in enumerate(samples):
@@ -92,18 +92,14 @@ def test_webdataset_suffixes(ray_start_2_cpus, tmp_path):
     assert len(glob.glob(f"{tmp_path}/*.tar")) == 1
 
     # test simple suffixes
-    ds = ray.data.read_webdataset(
-        paths=[str(tmp_path)], override_num_blocks=1, suffixes=["txt", "cls"]
-    )
+    ds = ray.data.read_webdataset(paths=[str(tmp_path)], suffixes=["txt", "cls"])
     samples = ds.take(100)
     assert len(samples) == 100
     for i, sample in enumerate(samples):
         assert set(sample.keys()) == {"__url__", "__key__", "txt", "cls"}
 
     # test fnmatch patterns for suffixes
-    ds = ray.data.read_webdataset(
-        paths=[str(tmp_path)], override_num_blocks=1, suffixes=["*.txt", "*.cls"]
-    )
+    ds = ray.data.read_webdataset(paths=[str(tmp_path)], suffixes=["*.txt", "*.cls"])
     samples = ds.take(100)
     assert len(samples) == 100
     for i, sample in enumerate(samples):
@@ -113,9 +109,7 @@ def test_webdataset_suffixes(ray_start_2_cpus, tmp_path):
     def select(name):
         return name.endswith("txt")
 
-    ds = ray.data.read_webdataset(
-        paths=[str(tmp_path)], override_num_blocks=1, suffixes=select
-    )
+    ds = ray.data.read_webdataset(paths=[str(tmp_path)], suffixes=select)
     samples = ds.take(100)
     assert len(samples) == 100
     for i, sample in enumerate(samples):
@@ -127,9 +121,7 @@ def renamer(name):
         print("***", name, result)
         return result
 
-    ds = ray.data.read_webdataset(
-        paths=[str(tmp_path)], override_num_blocks=1, filerename=renamer
-    )
+    ds = ray.data.read_webdataset(paths=[str(tmp_path)], filerename=renamer)
     samples = ds.take(100)
     assert len(samples) == 100
     for i, sample in enumerate(samples):
@@ -198,7 +190,7 @@ def test_webdataset_coding(ray_start_2_cpus, tmp_path):
     assert len(paths) == 1
     path = paths[0]
     assert os.path.exists(path)
-    ds = ray.data.read_webdataset(paths=[str(tmp_path)], override_num_blocks=1)
+    ds = ray.data.read_webdataset(paths=[str(tmp_path)])
     samples = ds.take(1)
     assert len(samples) == 1
     for sample in samples:
@@ -218,7 +210,7 @@ def test_webdataset_coding(ray_start_2_cpus, tmp_path):
 
     # test the format argument to the default decoder and multiple decoders
     ds = ray.data.read_webdataset(
-        paths=[str(tmp_path)], override_num_blocks=1, decoder=["PIL", custom_decoder]
+        paths=[str(tmp_path)], decoder=["PIL", custom_decoder]
     )
     samples = ds.take(1)
     assert len(samples) == 1
```
