[Data] fix RandomAccessDataset.multiget returning unexpected values for missing keys (#44769)

tespent · web-flow · commit 647b74a38fca · 2025-03-25T09:45:42.000-07:00
As stated in #44768, the current implementation of `multiget` based on `np.searchsorted` does not check for missing keys. I added the required checks and updated the unit test to cover this case. ## Why are these changes needed?  ## Related issue number  Closes #44768 ## Checks - [x] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR. - [x] I've run `scripts/format.sh` to lint the changes in this PR. - [ ] I've included any doc changes needed for https://docs.ray.io/en/master/. - [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file. - [x] I've made sure the tests are passing. Note that there might be a few flaky tests; see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [x] Unit tests - [ ] Release tests - [ ] This PR is not tested :( Signed-off-by: Wu Yufei <wuyufei.2000@bytedance.com>
diff --git a/python/ray/data/random_access_dataset.py b/python/ray/data/random_access_dataset.py
@@ -234,8 +234,10 @@ def multiget(self, block_indices, keys):
             col = block[self.key_field]
             indices = np.searchsorted(col, keys)
             acc = BlockAccessor.for_block(block)
-            result = [acc._get_row(i) for i in indices]
-            # assert result == [self._get(i, k) for i, k in zip(block_indices, keys)]
+            result = [
+                acc._get_row(i) if k1.as_py() == k2 else None
+                for i, k1, k2 in zip(indices, col.take(indices), keys)
+            ]
         else:
             result = [self._get(i, k) for i, k in zip(block_indices, keys)]
         self.total_time += time.perf_counter() - start
diff --git a/python/ray/data/tests/test_random_access.py b/python/ray/data/tests/test_random_access.py
@@ -8,26 +8,28 @@
 @pytest.mark.parametrize("pandas", [False, True])
 def test_basic(ray_start_regular_shared, pandas):
     ds = ray.data.range(100, override_num_blocks=10)
+    ds = ds.add_column("key", lambda b: b["id"] * 2)
     ds = ds.add_column("embedding", lambda b: b["id"] ** 2)
     if not pandas:
         ds = ds.map_batches(
             lambda df: pyarrow.Table.from_pandas(df), batch_format="pandas"
         )
 
-    rad = ds.to_random_access_dataset("id", num_workers=1)
+    rad = ds.to_random_access_dataset("key", num_workers=1)
+
+    def expected(i):
+        return {"id": i, "key": i * 2, "embedding": i**2}
 
     # Test get.
     assert ray.get(rad.get_async(-1)) is None
-    assert ray.get(rad.get_async(100)) is None
+    assert ray.get(rad.get_async(200)) is None
     for i in range(100):
-        assert ray.get(rad.get_async(i)) == {"id": i, "embedding": i**2}
-
-    def expected(i):
-        return {"id": i, "embedding": i**2}
+        assert ray.get(rad.get_async(i * 2 + 1)) is None
+        assert ray.get(rad.get_async(i * 2)) == expected(i)
 
     # Test multiget.
-    results = rad.multiget([-1] + list(range(10)) + [100])
-    assert results == [None] + [expected(i) for i in range(10)] + [None]
+    results = rad.multiget([-1] + list(range(0, 20, 2)) + list(range(1, 21, 2)) + [200])
+    assert results == [None] + [expected(i) for i in range(10)] + [None] * 10 + [None]
 
 
 def test_empty_blocks(ray_start_regular_shared):