Skip to content

Commit 5d9e661

Browse files
authored
[data] refine backpressure info on progress bar (#51697)
## Why are these changes needed? Refine backpressure info on progress bar to avoid confusion. * Add backpressure types * Add remaining budgets for reading task outputs to the debug message. Example: ``` Running Dataset. Active & requested resources: 2.1/12 CPU, 800.0MB/1.0GB object store: : 28.0 row [00:20, 1.26s/ row] - ReadRange->Map(map1): Tasks: 1 [backpressured:tasks,outputs]; Queued blocks: 99; Resources: 1.0 CPU, 640.0MB object store (in=320.0MB,out=320.0MB), budget=(cpu=5.0,gpu=inf,obj_store=0.0B,out=0.0B): : 30.0 row [00:19, 1.44 row/s] - Map(map2): Tasks: 1 [backpressured:tasks]; Queued blocks: 1; Resources: 1.1 CPU, 160.0MB object store (in=160.0MB,out=0.0B), budget=(cpu=4.9,gpu=inf,obj_store=96.0MB,out=224.0MB): : 26.0 row [00:19, 1.29 row/s] ``` ## Related issue number <!-- For example: "Closes #1234" --> --------- Signed-off-by: Hao Chen <chenh1024@gmail.com>
1 parent b7dae2a commit 5d9e661

File tree

2 files changed

+18
-2
lines changed

2 files changed

+18
-2
lines changed

python/ray/data/_internal/execution/resource_manager.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,12 @@ def get_op_usage_str(self, op: PhysicalOperator) -> str:
250250
budget = self._op_resource_allocator._op_budgets[op]
251251
usage_str += f", budget=(cpu={budget.cpu:.1f}"
252252
usage_str += f",gpu={budget.gpu:.1f}"
253-
usage_str += f",object store={budget.object_store_memory_str()})"
253+
usage_str += f",obj_store={budget.object_store_memory_str()}"
254+
# Remaining memory budget for producing new task outputs.
255+
reserved_for_output = memory_string(
256+
self._op_resource_allocator._output_budgets.get(op, 0)
257+
)
258+
usage_str += f",out={reserved_for_output})"
254259
return usage_str
255260

256261
def op_resource_allocator_enabled(self) -> bool:
@@ -405,6 +410,8 @@ def __init__(self, resource_manager: ResourceManager, reservation_ratio: float):
405410
self._total_shared = ExecutionResources.zero()
406411
# Resource budgets for each operator, excluding `_reserved_for_op_outputs`.
407412
self._op_budgets: Dict[PhysicalOperator, ExecutionResources] = {}
413+
# Remaining memory budget for generating new task outputs, per operator.
414+
self._output_budgets: Dict[PhysicalOperator, float] = {}
408415
# Whether each operator has reserved the minimum resources to run
409416
# at least one task.
410417
# This is used to avoid edge cases where the entire resource limits are not
@@ -553,12 +560,14 @@ def max_task_output_bytes_to_read(self, op: PhysicalOperator) -> Optional[int]:
553560
op_outputs_usage = self._get_op_outputs_usage_with_downstream(op)
554561
res += max(self._reserved_for_op_outputs[op] - op_outputs_usage, 0)
555562
if math.isinf(res):
563+
self._output_budgets[op] = res
556564
return None
557565

558566
res = int(res)
559567
assert res >= 0
560568
if res == 0 and self._should_unblock_streaming_output_backpressure(op):
561569
res = 1
570+
self._output_budgets[op] = res
562571
return res
563572

564573
def _get_downstream_ineligible_ops(

python/ray/data/_internal/execution/streaming_executor_state.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,14 @@ def summary_str(self, resource_manager: ResourceManager) -> str:
272272
self.op._in_task_submission_backpressure
273273
or self.op._in_task_output_backpressure
274274
):
275-
desc += " [backpressured]"
275+
backpressure_types = []
276+
if self.op._in_task_submission_backpressure:
277+
# The op is backpressured from submitting new tasks.
278+
backpressure_types.append("tasks")
279+
if self.op._in_task_output_backpressure:
280+
# The op is backpressured from producing new outputs.
281+
backpressure_types.append("outputs")
282+
desc += f" [backpressured:{','.join(backpressure_types)}]"
276283

277284
# Actors info
278285
desc += self.op.actor_info_progress_str()

0 commit comments

Comments
 (0)