@@ -74,6 +74,9 @@ def gen_expected_metrics(
74
74
"'task_submission_backpressure_time': "
75
75
f"{ 'N' if task_backpressure else 'Z' } "
76
76
),
77
+ "'num_alive_actors': Z" ,
78
+ "'num_restarting_actors': Z" ,
79
+ "'num_pending_actors': Z" ,
77
80
"'obj_store_mem_internal_inqueue_blocks': Z" ,
78
81
"'obj_store_mem_internal_outqueue_blocks': Z" ,
79
82
"'obj_store_mem_freed': N" ,
@@ -94,6 +97,9 @@ def gen_expected_metrics(
94
97
"'task_submission_backpressure_time': "
95
98
f"{ 'N' if task_backpressure else 'Z' } "
96
99
),
100
+ "'num_alive_actors': Z" ,
101
+ "'num_restarting_actors': Z" ,
102
+ "'num_pending_actors': Z" ,
97
103
"'obj_store_mem_internal_inqueue_blocks': Z" ,
98
104
"'obj_store_mem_internal_outqueue_blocks': Z" ,
99
105
"'obj_store_mem_used': A" ,
@@ -566,6 +572,9 @@ def test_dataset__repr__(ray_start_regular_shared, restore_data_context):
566
572
" num_tasks_failed: Z,\n "
567
573
" block_generation_time: N,\n "
568
574
" task_submission_backpressure_time: N,\n "
575
+ " num_alive_actors: Z,\n "
576
+ " num_restarting_actors: Z,\n "
577
+ " num_pending_actors: Z,\n "
569
578
" obj_store_mem_internal_inqueue_blocks: Z,\n "
570
579
" obj_store_mem_internal_outqueue_blocks: Z,\n "
571
580
" obj_store_mem_freed: N,\n "
@@ -681,6 +690,9 @@ def check_stats():
681
690
" num_tasks_failed: Z,\n "
682
691
" block_generation_time: N,\n "
683
692
" task_submission_backpressure_time: N,\n "
693
+ " num_alive_actors: Z,\n "
694
+ " num_restarting_actors: Z,\n "
695
+ " num_pending_actors: Z,\n "
684
696
" obj_store_mem_internal_inqueue_blocks: Z,\n "
685
697
" obj_store_mem_internal_outqueue_blocks: Z,\n "
686
698
" obj_store_mem_freed: N,\n "
@@ -751,6 +763,9 @@ def check_stats():
751
763
" num_tasks_failed: Z,\n "
752
764
" block_generation_time: N,\n "
753
765
" task_submission_backpressure_time: N,\n "
766
+ " num_alive_actors: Z,\n "
767
+ " num_restarting_actors: Z,\n "
768
+ " num_pending_actors: Z,\n "
754
769
" obj_store_mem_internal_inqueue_blocks: Z,\n "
755
770
" obj_store_mem_internal_outqueue_blocks: Z,\n "
756
771
" obj_store_mem_freed: N,\n "
@@ -1370,6 +1385,104 @@ def time_to_seconds(time_str):
1370
1385
assert isclose (percent , time_s / total_time * 100 , rel_tol = 0.01 )
1371
1386
1372
1387
1388
+ def test_per_node_metrics_basic (ray_start_regular_shared , restore_data_context ):
1389
+ """Basic test to ensure per-node metrics are populated."""
1390
+ ctx = DataContext .get_current ()
1391
+ ctx .enable_per_node_metrics = True
1392
+
1393
+ def _sum_net_metrics (per_node_metrics : Dict [str , NodeMetrics ]) -> Dict [str , float ]:
1394
+ sum_metrics = defaultdict (float )
1395
+ for metrics in per_node_metrics .values ():
1396
+ for metric , value in metrics .items ():
1397
+ sum_metrics [metric ] += value
1398
+ return sum_metrics
1399
+
1400
+ with patch ("ray.data._internal.stats.StatsManager._stats_actor" ) as mock_get_actor :
1401
+ mock_actor_handle = MagicMock ()
1402
+ mock_get_actor .return_value = mock_actor_handle
1403
+
1404
+ ds = ray .data .range (20 ).map_batches (lambda batch : batch ).materialize ()
1405
+ metrics = ds ._plan .stats ().extra_metrics
1406
+
1407
+ calls = mock_actor_handle .update_execution_metrics .remote .call_args_list
1408
+ assert len (calls ) > 0
1409
+
1410
+ last_args , _ = calls [- 1 ]
1411
+ per_node_metrics = last_args [- 1 ]
1412
+
1413
+ assert isinstance (per_node_metrics , dict )
1414
+ assert len (per_node_metrics ) >= 1
1415
+
1416
+ for nm in per_node_metrics .values ():
1417
+ for f in fields (NodeMetrics ):
1418
+ assert f .name in nm
1419
+
1420
+ # basic checks to make sure metrics are populated
1421
+ assert any (nm ["num_tasks_finished" ] > 0 for nm in per_node_metrics .values ())
1422
+ assert any (
1423
+ nm ["bytes_outputs_of_finished_tasks" ] > 0
1424
+ for nm in per_node_metrics .values ()
1425
+ )
1426
+ assert any (
1427
+ nm ["blocks_outputs_of_finished_tasks" ] > 0
1428
+ for nm in per_node_metrics .values ()
1429
+ )
1430
+
1431
+ net_metrics = _sum_net_metrics (per_node_metrics )
1432
+ assert net_metrics ["num_tasks_finished" ] == metrics ["num_tasks_finished" ]
1433
+ assert (
1434
+ net_metrics ["bytes_outputs_of_finished_tasks" ]
1435
+ == metrics ["bytes_outputs_of_finished_tasks" ]
1436
+ )
1437
+
1438
+
1439
+ @pytest .mark .parametrize ("enable_metrics" , [True , False ])
1440
+ def test_per_node_metrics_toggle (
1441
+ ray_start_regular_shared , restore_data_context , enable_metrics
1442
+ ):
1443
+ ctx = DataContext .get_current ()
1444
+ ctx .enable_per_node_metrics = enable_metrics
1445
+
1446
+ with patch ("ray.data._internal.stats.StatsManager._stats_actor" ) as mock_get_actor :
1447
+ mock_actor_handle = MagicMock ()
1448
+ mock_get_actor .return_value = mock_actor_handle
1449
+
1450
+ ray .data .range (10000 ).map (lambda x : x ).materialize ()
1451
+
1452
+ calls = mock_actor_handle .update_execution_metrics .remote .call_args_list
1453
+ assert len (calls ) > 0
1454
+
1455
+ last_args , _ = calls [- 1 ]
1456
+ per_node_metrics = last_args [- 1 ]
1457
+
1458
+ if enable_metrics :
1459
+ assert per_node_metrics is not None
1460
+ else :
1461
+ assert per_node_metrics is None
1462
+
1463
+
1464
+ def test_task_duration_stats ():
1465
+ """Test that OpTaskDurationStats correctly tracks running statistics using Welford's algorithm."""
1466
+ stats = TaskDurationStats ()
1467
+
1468
+ # Test initial state
1469
+ assert stats .count () == 0
1470
+ assert stats .mean () == 0.0
1471
+ assert stats .stddev () == 0.0
1472
+
1473
+ # Add some task durations and verify stats
1474
+ durations = [2.0 , 4.0 , 4.0 , 4.0 , 5.0 , 5.0 , 7.0 , 9.0 ]
1475
+ for d in durations :
1476
+ stats .add_duration (d )
1477
+
1478
+ # Compare with numpy's implementations
1479
+ assert stats .count () == len (durations )
1480
+ assert pytest .approx (stats .mean ()) == np .mean (durations )
1481
+ assert pytest .approx (stats .stddev ()) == np .std (
1482
+ durations , ddof = 1
1483
+ ) # ddof=1 for sample standard deviation
1484
+
1485
+
1373
1486
# NOTE: All tests above share a Ray cluster, while the tests below do not. These
1374
1487
# tests should only be carefully reordered to retain this invariant!
1375
1488
@@ -1709,104 +1822,6 @@ def update_stats_manager(i):
1709
1822
wait_for_condition (lambda : not StatsManager ._update_thread .is_alive ())
1710
1823
1711
1824
1712
- def test_per_node_metrics_basic (ray_start_regular_shared , restore_data_context ):
1713
- """Basic test to ensure per-node metrics are populated."""
1714
- ctx = DataContext .get_current ()
1715
- ctx .enable_per_node_metrics = True
1716
-
1717
- def _sum_net_metrics (per_node_metrics : Dict [str , NodeMetrics ]) -> Dict [str , float ]:
1718
- sum_metrics = defaultdict (float )
1719
- for metrics in per_node_metrics .values ():
1720
- for metric , value in metrics .items ():
1721
- sum_metrics [metric ] += value
1722
- return sum_metrics
1723
-
1724
- with patch ("ray.data._internal.stats.StatsManager._stats_actor" ) as mock_get_actor :
1725
- mock_actor_handle = MagicMock ()
1726
- mock_get_actor .return_value = mock_actor_handle
1727
-
1728
- ds = ray .data .range (20 ).map_batches (lambda batch : batch ).materialize ()
1729
- metrics = ds ._plan .stats ().extra_metrics
1730
-
1731
- calls = mock_actor_handle .update_execution_metrics .remote .call_args_list
1732
- assert len (calls ) > 0
1733
-
1734
- last_args , _ = calls [- 1 ]
1735
- per_node_metrics = last_args [- 1 ]
1736
-
1737
- assert isinstance (per_node_metrics , dict )
1738
- assert len (per_node_metrics ) >= 1
1739
-
1740
- for nm in per_node_metrics .values ():
1741
- for f in fields (NodeMetrics ):
1742
- assert f .name in nm
1743
-
1744
- # basic checks to make sure metrics are populated
1745
- assert any (nm ["num_tasks_finished" ] > 0 for nm in per_node_metrics .values ())
1746
- assert any (
1747
- nm ["bytes_outputs_of_finished_tasks" ] > 0
1748
- for nm in per_node_metrics .values ()
1749
- )
1750
- assert any (
1751
- nm ["blocks_outputs_of_finished_tasks" ] > 0
1752
- for nm in per_node_metrics .values ()
1753
- )
1754
-
1755
- net_metrics = _sum_net_metrics (per_node_metrics )
1756
- assert net_metrics ["num_tasks_finished" ] == metrics ["num_tasks_finished" ]
1757
- assert (
1758
- net_metrics ["bytes_outputs_of_finished_tasks" ]
1759
- == metrics ["bytes_outputs_of_finished_tasks" ]
1760
- )
1761
-
1762
-
1763
- @pytest .mark .parametrize ("enable_metrics" , [True , False ])
1764
- def test_per_node_metrics_toggle (
1765
- ray_start_regular_shared , restore_data_context , enable_metrics
1766
- ):
1767
- ctx = DataContext .get_current ()
1768
- ctx .enable_per_node_metrics = enable_metrics
1769
-
1770
- with patch ("ray.data._internal.stats.StatsManager._stats_actor" ) as mock_get_actor :
1771
- mock_actor_handle = MagicMock ()
1772
- mock_get_actor .return_value = mock_actor_handle
1773
-
1774
- ray .data .range (10000 ).map (lambda x : x ).materialize ()
1775
-
1776
- calls = mock_actor_handle .update_execution_metrics .remote .call_args_list
1777
- assert len (calls ) > 0
1778
-
1779
- last_args , _ = calls [- 1 ]
1780
- per_node_metrics = last_args [- 1 ]
1781
-
1782
- if enable_metrics :
1783
- assert per_node_metrics is not None
1784
- else :
1785
- assert per_node_metrics is None
1786
-
1787
-
1788
- def test_task_duration_stats ():
1789
- """Test that OpTaskDurationStats correctly tracks running statistics using Welford's algorithm."""
1790
- stats = TaskDurationStats ()
1791
-
1792
- # Test initial state
1793
- assert stats .count () == 0
1794
- assert stats .mean () == 0.0
1795
- assert stats .stddev () == 0.0
1796
-
1797
- # Add some task durations and verify stats
1798
- durations = [2.0 , 4.0 , 4.0 , 4.0 , 5.0 , 5.0 , 7.0 , 9.0 ]
1799
- for d in durations :
1800
- stats .add_duration (d )
1801
-
1802
- # Compare with numpy's implementations
1803
- assert stats .count () == len (durations )
1804
- assert pytest .approx (stats .mean ()) == np .mean (durations )
1805
- assert pytest .approx (stats .stddev ()) == np .std (
1806
- durations , ddof = 1
1807
- ) # ddof=1 for sample standard deviation
1808
-
1809
-
1810
1825
if __name__ == "__main__" :
1811
1826
import sys
1812
1827
0 commit comments