Commit d7601c1
[Data] Poll memory usage per map task (#51324)
## Why are these changes needed?

### Context

Currently, we estimate `BlockExecStats.rss_bytes` by checking the process memory usage after the UDF yields an output. At that point, however, variables might've already gone out of scope and been garbage collected, so the estimate is often a large underestimate.

### Change

This PR makes each `_map_task` launch a thread that polls the process memory every 1 s, which should provide a more accurate memory estimate. You can disable this feature by executing the following statement:

```python
ray.data.DataContext.get_current().memory_poll_interval_s = None
```

### Notes

We considered several approaches to estimate the memory use per task:

* `ru_maxrss`: the high RSS watermark of a process. Inaccurate because Ray might reuse a worker process to execute two distinct UDFs, and RSS double-counts shared memory.
* `memory_full_info().uss` with polling: accurate, but slow. In our map release test, a call can take 0.1 s+ (for comparison, a PyArrow UDF on a 128 MiB block can take <0.01 s).
* `memory_info().rss` with polling: double-counts because it includes shared memory (like Ray objects).
* `memory_info().rss - memory_info().shared` with polling: an estimate of the USS, and the approach we went with. On the map and batch inference release tests, the % difference from the true USS is a few percent, except for the model prediction UDF (a 20% underestimate, likely due to counting shared memory from Torch). It takes ~0.0001 s per call.

Here's a chart of `true_uss - (rss - shared)` for the map batches release test; the x-axis is the value in MiB.

![Distribution Plot with Seaborn](https://github.com/user-attachments/assets/faca1a0c-1f41-4d29-9b4a-4f5bab12817c)

## Related issue number

## Checks

- [ ] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR.
- [ ] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a few flaky tests; see the recent failures at https://flakey-tests.ray.io/
- Testing strategy
  - [ ] Unit tests
  - [ ] Release tests
  - [ ] This PR is not tested :(

---------

Signed-off-by: Balaji Veeramani <bveeramani@berkeley.edu>
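To make the chosen trade-off concrete, here's a minimal standalone sketch of the `rss - shared` estimate described above (a sketch, assuming a Linux host, where psutil's `memory_info()` result includes a `shared` field; the `estimate_uss` helper is illustrative, not part of this PR):

```python
import os

import psutil


def estimate_uss(process: psutil.Process) -> int:
    """Estimate the unique set size (USS) as RSS minus shared memory."""
    memory_info = process.memory_info()
    # RSS counts every physical page mapped into the process, including pages
    # shared with other processes (e.g., Ray's object store). Subtracting the
    # shared pages approximates the memory that would be freed if the process
    # exited right now.
    return memory_info.rss - memory_info.shared


print(estimate_uss(psutil.Process(os.getpid())))
```

On macOS and Windows, `memory_info()` has no `shared` field; the accurate-but-slow `memory_full_info().uss` is the portable alternative the PR description benchmarks against.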
1 parent ba72103 commit d7601c1

File tree

8 files changed: +158 additions, −57 deletions

python/ray/data/_internal/execution/interfaces/op_runtime_metrics.py

Lines changed: 11 additions & 11 deletions

```diff
@@ -392,7 +392,7 @@ def __init__(self, op: "PhysicalOperator"):
         self._per_node_metrics: Dict[str, NodeMetrics] = defaultdict(NodeMetrics)
         self._per_node_metrics_enabled: bool = op.data_context.enable_per_node_metrics
 
-        self._cum_rss_bytes: Optional[int] = None
+        self._cum_max_uss_bytes: Optional[int] = None
 
     @property
     def extra_metrics(self) -> Dict[str, Any]:
@@ -542,17 +542,17 @@ def average_bytes_outputs_per_task(self) -> Optional[float]:
         return self.bytes_outputs_of_finished_tasks / self.num_tasks_finished
 
     @metric_property(
-        description="Average RSS usage of tasks.",
+        description="Average USS usage of tasks.",
         metrics_group=MetricsGroup.TASKS,
         map_only=True,
     )
-    def average_memory_usage_per_task(self) -> Optional[float]:
-        """Average RSS usage of tasks."""
-        if self._cum_rss_bytes is None:
+    def average_max_uss_per_task(self) -> Optional[float]:
+        """Average max USS usage of tasks."""
+        if self._cum_max_uss_bytes is None:
             return None
         else:
             assert self.num_task_outputs_generated > 0, self.num_task_outputs_generated
-            return self._cum_rss_bytes / self.num_task_outputs_generated
+            return self._cum_max_uss_bytes / self.num_task_outputs_generated
 
     def on_input_received(self, input: RefBundle):
         """Callback when the operator receives a new input."""
@@ -639,13 +639,13 @@ def on_task_output_generated(self, task_index: int, output: RefBundle):
         assert meta.num_rows is not None
         self.rows_task_outputs_generated += meta.num_rows
         trace_allocation(block_ref, "operator_output")
-        if meta.exec_stats.rss_bytes is not None:
-            if self._cum_rss_bytes is None:
-                self._cum_rss_bytes = meta.exec_stats.rss_bytes
+        if meta.exec_stats.max_uss_bytes is not None:
+            if self._cum_max_uss_bytes is None:
+                self._cum_max_uss_bytes = meta.exec_stats.max_uss_bytes
             else:
-                self._cum_rss_bytes += meta.exec_stats.rss_bytes
+                self._cum_max_uss_bytes += meta.exec_stats.max_uss_bytes
         else:
-            assert not self._is_map, "Map operators should collect RSS metrics"
+            assert not self._is_map, "Map operators should collect memory metrics"
 
         # Update per node metrics
         if self._per_node_metrics_enabled:
```

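The metric above is a plain running mean: each generated output contributes its task's max USS to a cumulative sum, which is divided by the number of outputs. A toy sketch of that accumulation (the class and method names here are illustrative, not Ray's):

```python
from typing import Optional


class UssAverager:
    """Toy model of the accumulation in on_task_output_generated."""

    def __init__(self) -> None:
        self._cum_max_uss_bytes: Optional[int] = None
        self._num_outputs = 0

    def on_output(self, max_uss_bytes: int) -> None:
        # First output initializes the sum; later outputs accumulate.
        if self._cum_max_uss_bytes is None:
            self._cum_max_uss_bytes = max_uss_bytes
        else:
            self._cum_max_uss_bytes += max_uss_bytes
        self._num_outputs += 1

    @property
    def average(self) -> Optional[float]:
        if self._cum_max_uss_bytes is None:
            return None
        return self._cum_max_uss_bytes / self._num_outputs


averager = UssAverager()
assert averager.average is None
averager.on_output(1)
averager.on_output(3)
assert averager.average == 2  # (1 + 3) / 2
```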
python/ray/data/_internal/execution/operators/map_operator.py

Lines changed: 11 additions & 11 deletions

```diff
@@ -49,9 +49,9 @@
 from ray.data.block import (
     Block,
     BlockAccessor,
-    BlockExecStats,
     BlockMetadata,
     BlockStats,
+    _BlockExecStatsBuilder,
     to_stats,
 )
 from ray.data.context import DataContext
@@ -530,17 +530,17 @@ def _map_task(
     """
     DataContext._set_current(data_context)
    ctx.kwargs.update(kwargs)
-    stats = BlockExecStats.builder()
     map_transformer.set_target_max_block_size(ctx.target_max_block_size)
-    for b_out in map_transformer.apply_transform(iter(blocks), ctx):
-        # TODO(Clark): Add input file propagation from input blocks.
-        m_out = BlockAccessor.for_block(b_out).get_metadata()
-        m_out.exec_stats = stats.build()
-        m_out.exec_stats.udf_time_s = map_transformer.udf_time()
-        m_out.exec_stats.task_idx = ctx.task_idx
-        yield b_out
-        yield m_out
-        stats = BlockExecStats.builder()
+    with _BlockExecStatsBuilder(data_context.memory_poll_interval_s) as stats:
+        for b_out in map_transformer.apply_transform(iter(blocks), ctx):
+            # TODO(Clark): Add input file propagation from input blocks.
+            m_out = BlockAccessor.for_block(b_out).get_metadata()
+            m_out.exec_stats = stats.build()
+            m_out.exec_stats.udf_time_s = map_transformer.udf_time()
+            m_out.exec_stats.task_idx = ctx.task_idx
+            yield b_out
+            yield m_out
+            stats.reset()
 
 
 class _BlockRefBundler:
```

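The structural change in `_map_task`: previously a fresh `BlockExecStats.builder()` was constructed per output block, whereas now a single context-managed builder (and its poll thread) spans the whole task, with `build()`/`reset()` bracketing each block. A hedged miniature of that pattern, using a stand-in builder that only tracks wall time:

```python
import time


class _ToyStatsBuilder:
    """Stand-in for _BlockExecStatsBuilder that only tracks wall time."""

    def __enter__(self):
        self.reset()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # The real builder stops its USS poll thread here.
        pass

    def reset(self):
        self._start = time.perf_counter()

    def build(self) -> float:
        return time.perf_counter() - self._start


def map_task(items):
    # One builder spans the whole task; build()/reset() bracket each output.
    with _ToyStatsBuilder() as stats:
        for item in items:
            out = item * 2  # stand-in for the UDF
            yield out, stats.build()
            stats.reset()


for out, wall_time_s in map_task(range(3)):
    print(out, f"{wall_time_s:.6f}s")
```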
python/ray/data/_internal/stats.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -1,4 +1,5 @@
 import collections
+import enum
 import logging
 import threading
 import time
@@ -7,7 +8,6 @@
 from dataclasses import dataclass, fields
 from typing import Any, Dict, List, Mapping, Optional, Set, Tuple, Union
 from uuid import uuid4
-import enum
 
 import numpy as np
 
@@ -1301,7 +1301,7 @@ def from_block_metadata(
         }
 
         memory_stats_mb = [
-            round(e.rss_bytes / (1024 * 1024), 2) for e in exec_stats
+            round(e.max_uss_bytes / (1024 * 1024), 2) for e in exec_stats
         ]
         memory_stats = {
             "min": min(memory_stats_mb),
```

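For context, the surrounding code turns per-block memory into summary statistics in MiB. A small sketch of that aggregation under the new field name (the diff only shows the `min` key; the `max` and `mean` keys below are assumptions):

```python
max_uss_bytes_per_block = [2 * 1024 * 1024, 1 * 1024 * 1024]

memory_stats_mb = [round(b / (1024 * 1024), 2) for b in max_uss_bytes_per_block]
memory_stats = {
    "min": min(memory_stats_mb),
    "max": max(memory_stats_mb),  # assumption: key not visible in this hunk
    "mean": round(sum(memory_stats_mb) / len(memory_stats_mb), 2),  # assumption
}
print(memory_stats)  # {'min': 1.0, 'max': 2.0, 'mean': 1.5}
```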
python/ray/data/block.py

Lines changed: 85 additions & 13 deletions

```diff
@@ -1,6 +1,7 @@
 import collections
 import logging
 import os
+import threading
 import time
 from dataclasses import asdict, dataclass, fields
 from enum import Enum
@@ -134,7 +135,9 @@ def __init__(self):
         self.udf_time_s: Optional[float] = 0
         self.cpu_time_s: Optional[float] = None
         self.node_id = ray.runtime_context.get_runtime_context().get_node_id()
-        self.rss_bytes: int = 0
+        # An estimate of the maximum amount of physical memory that the process was
+        # using while computing this block.
+        self.max_uss_bytes: int = 0
         self.task_idx: Optional[int] = None
 
     @staticmethod
@@ -153,30 +156,99 @@ def __repr__(self):
 
 
 class _BlockExecStatsBuilder:
-    """Helper class for building block stats.
+    """Helper context manager for building block stats.
 
     When this class is created, we record the start time. When build() is
     called, the time delta is saved as part of the stats.
     """
 
-    def __init__(self):
-        self.start_time = time.perf_counter()
-        self.start_cpu = time.process_time()
+    def __init__(self, poll_interval_s: Optional[float] = None):
+        """
+
+        Args:
+            poll_interval_s: The interval to poll the USS of the process. If `None`,
+                this class won't poll the USS.
+        """
+        self._poll_interval_s = poll_interval_s
+
+        # Record start times.
+        self._start_time = time.perf_counter()
+        self._start_cpu = time.process_time()
+
+        # Record initial USS.
+        self._process = psutil.Process(os.getpid())
+        self._max_uss = self._estimate_uss()
+        self._max_uss_lock = threading.Lock()
+
+        self._uss_poll_thread = None
+        self._stop_uss_poll_event = None
+
+    def __enter__(self):
+        if self._poll_interval_s is not None:
+            (
+                self._uss_poll_thread,
+                self._stop_uss_poll_event,
+            ) = self._start_uss_poll_thread()
+
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self._uss_poll_thread is not None:
+            self._stop_uss_poll_thread()
 
     def build(self) -> "BlockExecStats":
-        self.end_time = time.perf_counter()
-        self.end_cpu = time.process_time()
+        # Record end times.
+        end_time = time.perf_counter()
+        end_cpu = time.process_time()
+
+        # Record max USS.
+        with self._max_uss_lock:
+            self._max_uss = max(self._max_uss, self._estimate_uss())
 
+        # Build the stats.
         stats = BlockExecStats()
-        stats.start_time_s = self.start_time
-        stats.end_time_s = self.end_time
-        stats.wall_time_s = self.end_time - self.start_time
-        stats.cpu_time_s = self.end_cpu - self.start_cpu
-        process = psutil.Process(os.getpid())
-        stats.rss_bytes = int(process.memory_info().rss)
+        stats.start_time_s = self._start_time
+        stats.end_time_s = end_time
+        stats.wall_time_s = end_time - self._start_time
+        stats.cpu_time_s = end_cpu - self._start_cpu
+        stats.max_uss_bytes = self._max_uss
 
         return stats
 
+    def reset(self):
+        self._start_time = time.perf_counter()
+        self._start_cpu = time.process_time()
+        with self._max_uss_lock:
+            self._max_uss = self._estimate_uss()
+
+    def _start_uss_poll_thread(self) -> Tuple[threading.Thread, threading.Event]:
+        assert self._poll_interval_s is not None
+
+        stop_event = threading.Event()
+
+        def poll_uss():
+            while not stop_event.is_set():
+                with self._max_uss_lock:
+                    self._max_uss = max(self._max_uss, self._estimate_uss())
+                stop_event.wait(self._poll_interval_s)
+
+        thread = threading.Thread(target=poll_uss, daemon=True)
+        thread.start()
+
+        return thread, stop_event
+
+    def _stop_uss_poll_thread(self):
+        if self._stop_uss_poll_event is not None:
+            self._stop_uss_poll_event.set()
+            self._uss_poll_thread.join()
+
+    def _estimate_uss(self) -> int:
+        memory_info = self._process.memory_info()
+        # Estimate the USS (the amount of memory that'd be free if we killed the
+        # process right now) as the difference between the RSS (total physical memory)
+        # and amount of shared physical memory.
+        return memory_info.rss - memory_info.shared
 
 
 @DeveloperAPI
 @dataclass
```

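A quick way to poke at the builder directly (a sketch, not an official API: `_BlockExecStatsBuilder` is private and may change, and the `rss - shared` estimate assumes a Linux host):

```python
import time

from ray.data.block import _BlockExecStatsBuilder  # private; may change

with _BlockExecStatsBuilder(poll_interval_s=0.1) as builder:
    payload = [0] * 10_000_000  # hold ~80 MB so the poll thread can see it
    time.sleep(0.5)  # let the 0.1 s poller take a few samples
    stats = builder.build()

print(f"wall: {stats.wall_time_s:.3f} s")
print(f"max USS: {stats.max_uss_bytes / 1024 ** 2:.1f} MiB")
```

Entering the context starts the daemon poll thread, `build()` snapshots the stats (taking one final USS sample), and exiting stops the thread.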
python/ray/data/context.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -323,6 +323,8 @@ class DataContext:
             transient errors when reading from remote storage systems.
         enable_per_node_metrics: Enable per node metrics reporting for Ray Data,
             disabled by default.
+        memory_poll_interval_s: The interval to poll the USS of map tasks. If `None`,
+            map tasks won't record memory stats.
     """
 
     target_max_block_size: int = DEFAULT_TARGET_MAX_BLOCK_SIZE
@@ -395,6 +397,7 @@ class DataContext:
     )
     enable_per_node_metrics: bool = DEFAULT_ENABLE_PER_NODE_METRICS
     override_object_store_memory_limit_fraction: float = None
+    memory_poll_interval_s: Optional[float] = 1
 
     def __post_init__(self):
         # The additonal ray remote args that should be added to
```

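Adjusting the new knob from user code (the 0.25 value is illustrative; the default is 1 s, and `None` disables polling, as noted in the PR description):

```python
import ray

ctx = ray.data.DataContext.get_current()

# Poll more often than the 1 s default:
ctx.memory_poll_interval_s = 0.25

# Or disable per-task memory polling entirely:
ctx.memory_poll_interval_s = None
```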
python/ray/data/tests/conftest.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -418,7 +418,7 @@ def op_two_block():
     block_params = {
         "num_rows": [10000, 5000],
         "size_bytes": [100, 50],
-        "rss_bytes": [1024 * 1024 * 2, 1024 * 1024 * 1],
+        "uss_bytes": [1024 * 1024 * 2, 1024 * 1024 * 1],
         "wall_time": [5, 10],
         "cpu_time": [1.2, 3.4],
         "udf_time": [1.1, 1.7],
@@ -439,7 +439,7 @@ def op_two_block():
         block_exec_stats.cpu_time_s = block_params["cpu_time"][i]
         block_exec_stats.udf_time_s = block_params["udf_time"][i]
         block_exec_stats.node_id = block_params["node_id"][i]
-        block_exec_stats.rss_bytes = block_params["rss_bytes"][i]
+        block_exec_stats.max_uss_bytes = block_params["uss_bytes"][i]
         block_exec_stats.task_idx = block_params["task_idx"][i]
         block_meta_list.append(
             BlockMetadata(
```

python/ray/data/tests/test_op_runtime_metrics.py

Lines changed: 10 additions & 10 deletions

```diff
@@ -9,15 +9,15 @@
 from ray.data.block import BlockExecStats, BlockMetadata
 
 
-def test_average_memory_usage_per_task():
+def test_average_max_uss_per_task():
     # No tasks submitted yet.
     metrics = OpRuntimeMetrics(MagicMock())
-    assert metrics.average_memory_usage_per_task is None
+    assert metrics.average_max_uss_per_task is None
 
-    def create_bundle(rss_bytes: int):
+    def create_bundle(uss_bytes: int):
         block = ray.put(pa.Table.from_pydict({}))
         stats = BlockExecStats()
-        stats.rss_bytes = rss_bytes
+        stats.max_uss_bytes = uss_bytes
         stats.wall_time_s = 0
         metadata = BlockMetadata(
             num_rows=0,
@@ -29,20 +29,20 @@ def create_bundle(rss_bytes: int):
         return RefBundle([(block, metadata)], owns_blocks=False)
 
     # Submit two tasks.
-    bundle = create_bundle(rss_bytes=0)
+    bundle = create_bundle(uss_bytes=0)
     metrics.on_task_submitted(0, bundle)
     metrics.on_task_submitted(1, bundle)
-    assert metrics.average_memory_usage_per_task is None
+    assert metrics.average_max_uss_per_task is None
 
     # Generate one output for the first task.
-    bundle = create_bundle(rss_bytes=1)
+    bundle = create_bundle(uss_bytes=1)
     metrics.on_task_output_generated(0, bundle)
-    assert metrics.average_memory_usage_per_task == 1
+    assert metrics.average_max_uss_per_task == 1
 
     # Generate one output for the second task.
-    bundle = create_bundle(rss_bytes=3)
+    bundle = create_bundle(uss_bytes=3)
     metrics.on_task_output_generated(0, bundle)
-    assert metrics.average_memory_usage_per_task == 2  # (1 + 3) / 2 = 2
+    assert metrics.average_max_uss_per_task == 2  # (1 + 3) / 2 = 2
 
 
 if __name__ == "__main__":
```
