[integ-tests] Skip insufficient capacity errors (ICE) when checking logs in all tests

hanwen-cluster · hanwen-cluster · commit 3eb99d2558bf · 2025-07-07T11:30:29.000-07:00
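This commit reduces intermittent test failures by passing `skip_ice=True` to the integration tests' `assert_no_errors_in_logs` calls, so that insufficient capacity errors found in the logs are skipped instead of failing the test.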
diff --git a/tests/integration-tests/benchmarks/test_scaling_performance.py b/tests/integration-tests/benchmarks/test_scaling_performance.py
@@ -72,4 +72,4 @@ def test_scaling_performance(
     )
     assert_that(max(compute_nodes_time_series)).is_equal_to(benchmark_params["scaling_target"])
     assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
-    assert_no_errors_in_logs(remote_command_executor, scheduler)
+    assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
diff --git a/tests/integration-tests/benchmarks/test_scheduler_performance.py b/tests/integration-tests/benchmarks/test_scheduler_performance.py
@@ -76,7 +76,7 @@ def test_scheduler_performance(
     assert_that(max(compute_nodes_time_series)).is_equal_to(benchmark_params["scaling_target"])
     assert_that(compute_nodes_time_series[-1]).is_equal_to(0)
     _assert_jobs_completed(remote_command_executor, benchmark_params["jobs_to_submit"])
-    assert_no_errors_in_logs(remote_command_executor, scheduler)
+    assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
 
 
 def _submit_jobs(benchmark_params, scheduler_commands, cluster, scheduler_commands_factory):
diff --git a/tests/integration-tests/tests/basic/test_essential_features.py b/tests/integration-tests/tests/basic/test_essential_features.py
@@ -356,5 +356,5 @@ def _test_disable_hyperthreading(
         default_threads_per_core=default_threads_per_core,
     )
 
-    assert_no_errors_in_logs(remote_command_executor, scheduler)
+    assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
     run_system_analyzer(cluster, scheduler_commands_factory, request)
diff --git a/tests/integration-tests/tests/cli_commands/test_cli_commands.py b/tests/integration-tests/tests/cli_commands/test_cli_commands.py
@@ -97,7 +97,7 @@ def test_slurm_cli_commands(
     _test_pcluster_compute_fleet(cluster, expected_num_nodes=2)
 
     remote_command_executor = RemoteCommandExecutor(cluster)
-    assert_no_errors_in_logs(remote_command_executor, scheduler)
+    assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
 
 
 def _test_create_cluster(clusters_factory, cluster_config, request):
diff --git a/tests/integration-tests/tests/iam/test_iam.py b/tests/integration-tests/tests/iam/test_iam.py
@@ -302,7 +302,7 @@ def test_iam_policies(region, scheduler, pcluster_config_reader, clusters_factor
     if scheduler == "awsbatch":
         _test_batch_access(remote_command_executor, region)
 
-    assert_no_errors_in_logs(remote_command_executor, scheduler)
+    assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
 
 
 def _test_s3_access(remote_command_executor, region):
diff --git a/tests/integration-tests/tests/intel_hpc/test_intel_hpc.py b/tests/integration-tests/tests/intel_hpc/test_intel_hpc.py
@@ -33,7 +33,7 @@ def test_intel_hpc(
     _test_intel_instance_tags(cluster.get_cluster_instance_ids(), region)
     _test_intel_clck(remote_command_executor, scheduler_commands, test_datadir)
 
-    assert_no_errors_in_logs(remote_command_executor, scheduler)
+    assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
 
 
 def _test_intel_instance_tags(cluster_instances, region):
diff --git a/tests/integration-tests/tests/networking/test_cluster_networking.py b/tests/integration-tests/tests/networking/test_cluster_networking.py
@@ -170,7 +170,7 @@ def test_cluster_in_no_internet_subnet(
     _run_prolog_epilog_jobs(remote_command_executor, slurm_commands)
     _run_mpi_jobs(mpi_variants, remote_command_executor, test_datadir, slurm_commands, cluster, region)
     check_pcluster_list_cluster_log_streams(cluster, os)
-    assert_no_errors_in_logs(remote_command_executor, scheduler)
+    assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
     logging.info("Checking compute node is scaled down after scaledown idle time")
     wait_for_num_instances_in_cluster(cluster.cfn_name, region, 1)
 
diff --git a/tests/integration-tests/tests/scaling/test_scaling.py b/tests/integration-tests/tests/scaling/test_scaling.py
@@ -281,7 +281,7 @@ def _test_multiple_jobs(cluster, remote_command_executor, test_datadir, region,
     )
 
     logging.info("Verifying no error in logs")
-    assert_no_errors_in_logs(remote_command_executor, "slurm")
+    assert_no_errors_in_logs(remote_command_executor, "slurm", skip_ice=True)
 
 
 @pytest.mark.usefixtures("os", "instance", "scheduler")
diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py
@@ -125,7 +125,7 @@ def test_slurm(
 
     # Tests below must run on HeadNode or need HeadNode participate.
     head_node_command_executor = RemoteCommandExecutor(cluster)
-    assert_no_errors_in_logs(head_node_command_executor, "slurm")
+    assert_no_errors_in_logs(head_node_command_executor, "slurm", skip_ice=True)
     # Test compute node bootstrap timeout
     clustermgtd_conf_path = retrieve_clustermgtd_conf_path(head_node_command_executor)
     _test_compute_node_bootstrap_timeout(
@@ -244,7 +244,7 @@ def test_slurm_from_login_nodes_in_private_network(
     # Test torque command wrapper
     _test_torque_job_submit(remote_command_executor, test_datadir)
     head_node_command_executor = RemoteCommandExecutor(cluster)
-    assert_no_errors_in_logs(head_node_command_executor, "slurm")
+    assert_no_errors_in_logs(head_node_command_executor, "slurm", skip_ice=True)
 
 
 @pytest.mark.usefixtures("region", "os", "instance", "scheduler")
@@ -309,7 +309,7 @@ def test_slurm_scaling(
         dynamic_instance_type=instance,
         stop_max_delay_secs=stop_max_delay_secs,
     )
-    assert_no_errors_in_logs(remote_command_executor, scheduler)
+    assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
 
 
 @pytest.mark.usefixtures("os", "instance", "scheduler")
@@ -426,7 +426,7 @@ def test_error_handling(
         num_static_nodes=1,
     )
     # Next test will introduce error in logs, assert no error now
-    assert_no_errors_in_logs(remote_command_executor, scheduler)
+    assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
     _test_clustermgtd_down_logic(
         remote_command_executor,
         scheduler_commands,
@@ -1047,7 +1047,7 @@ def test_slurm_overrides(
         )
         assert_msg_in_log(remote_command_executor, slurm_resume_log, f"Found {api} parameters override")
 
-    assert_no_errors_in_logs(remote_command_executor, scheduler)
+    assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
 
 
 def _assert_cluster_initial_conditions(

Original file line number	Diff line number	Diff line change
`@@ -72,4 +72,4 @@ def test_scaling_performance(`
`72`	`72`	`)`
`73`	`73`	`assert_that(max(compute_nodes_time_series)).is_equal_to(benchmark_params["scaling_target"])`
`74`	`74`	`assert_that(compute_nodes_time_series[-1]).is_equal_to(0)`
`75`		`- assert_no_errors_in_logs(remote_command_executor, scheduler)`
	`75`	`+ assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)`
Original file line number	Diff line number	Diff line change
`@@ -356,5 +356,5 @@ def _test_disable_hyperthreading(`
`356`	`356`	`default_threads_per_core=default_threads_per_core,`
`357`	`357`	`)`
`358`	`358`
`359`		`- assert_no_errors_in_logs(remote_command_executor, scheduler)`
	`359`	`+ assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)`
`360`	`360`	`run_system_analyzer(cluster, scheduler_commands_factory, request)`
Original file line number	Diff line number	Diff line change
`@@ -281,7 +281,7 @@ def _test_multiple_jobs(cluster, remote_command_executor, test_datadir, region,`
`281`	`281`	`)`
`282`	`282`
`283`	`283`	`logging.info("Verifying no error in logs")`
`284`		`- assert_no_errors_in_logs(remote_command_executor, "slurm")`
	`284`	`+ assert_no_errors_in_logs(remote_command_executor, "slurm", skip_ice=True)`
`285`	`285`
`286`	`286`
`287`	`287`	`@pytest.mark.usefixtures("os", "instance", "scheduler")`