@@ -99,35 +99,30 @@ def test_starccm(
 
     # Copy additional files in advance to avoid conflicts when running the 8- and 16-node tests in parallel
     remote_command_executor._copy_additional_files([str(test_datadir / "starccm.slurm.sh")])
-    # Run 8 and 16 node tests in parallel
-    result_8 = remote_command_executor.run_remote_command(
-        f'sbatch --ntasks={number_of_nodes[0] * TASK_VCPUS} starccm.slurm.sh "{podkey}" "{licpath}"'
-    )
-    logging.info(f"Submitting StarCCM+ job with {number_of_nodes[0]} nodes")
-    result_16 = remote_command_executor.run_remote_command(
-        f'sbatch --ntasks={number_of_nodes[1] * TASK_VCPUS} starccm.slurm.sh "{podkey}" "{licpath}"'
-    )
-    logging.info(f"Submitting StarCCM+ job with {number_of_nodes[1]} nodes")
-    observed_value_8 = calculate_observed_value(
-        result_8, remote_command_executor, scheduler_commands, test_datadir, number_of_nodes[0]
-    )
-    observed_value_16 = calculate_observed_value(
-        result_16, remote_command_executor, scheduler_commands, test_datadir, number_of_nodes[1]
-    )
 
-    # Run 32 node test
-    result_32 = remote_command_executor.run_remote_command(
-        f'sbatch --ntasks={number_of_nodes[2] * TASK_VCPUS} starccm.slurm.sh "{podkey}" "{licpath}"'
-    )
-    logging.info(f"Submitting StarCCM+ job with {number_of_nodes[2]} nodes")
-    observed_value_32 = calculate_observed_value(
-        result_32, remote_command_executor, scheduler_commands, test_datadir, number_of_nodes[2]
-    )
+    max_node_num = max(number_of_nodes)
+    final_result = []
+    for num_of_nodes in number_of_nodes:
+        parallelism = int(max_node_num / num_of_nodes)
+        result = []
+        logging.info(f"Submitting StarCCM+ job with {num_of_nodes} nodes")
+        run_command = f'sbatch --ntasks={num_of_nodes * TASK_VCPUS} starccm.slurm.sh "{podkey}" "{licpath}"'
+        multiple_runs = []
+        # Run at least twice, and up to whatever parallelism allows, to maximize usage of the available nodes
+        number_of_runs = max(parallelism, 2)
+        for _ in range(number_of_runs):
+            multiple_runs.append(remote_command_executor.run_remote_command(run_command))
+        for run in multiple_runs:
+            result.append(
+                calculate_observed_value(run, remote_command_executor, scheduler_commands, test_datadir, num_of_nodes)
+            )
+        final_result.append((num_of_nodes, sum(result) / len(result)))  # Use the average to reduce the noise of each run.
+        logging.info(f"Finished StarCCM+ job with {num_of_nodes} nodes")
+
+    push_result_to_dynamodb("StarCCM", final_result, instance, os)
 
     # Check results and log performance degradation
-    result = list(zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]))
-    push_result_to_dynamodb("StarCCM", result, instance, os)
-    for node, observed_value in result:
+    for node, observed_value in final_result:
         baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node]
         _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value)
 
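For reference, the submit-repeatedly-and-average pattern this change introduces can be sketched in isolation roughly as follows. This is a minimal, self-contained approximation, not the test itself: `submit_job` and `measure_elapsed_seconds` are hypothetical stand-ins for the real sbatch submission via `run_remote_command` and for `calculate_observed_value`, and the node counts are assumed example values.

```python
import random

NODE_COUNTS = [8, 16, 32]  # assumed example values for number_of_nodes


def submit_job(num_of_nodes):
    # Hypothetical stand-in for submitting the StarCCM+ job via sbatch.
    return {"nodes": num_of_nodes}


def measure_elapsed_seconds(job):
    # Hypothetical stand-in for calculate_observed_value(); returns fake timings.
    return 1000 / job["nodes"] + random.uniform(-5, 5)


def run_scaling_test(node_counts):
    max_node_num = max(node_counts)
    final_result = []
    for num_of_nodes in node_counts:
        # Submit at least two copies, and more when smaller jobs would leave nodes idle.
        number_of_runs = max(max_node_num // num_of_nodes, 2)
        jobs = [submit_job(num_of_nodes) for _ in range(number_of_runs)]
        timings = [measure_elapsed_seconds(job) for job in jobs]
        # Average the repeated runs to reduce per-run noise.
        final_result.append((num_of_nodes, sum(timings) / len(timings)))
    return final_result


print(run_scaling_test(NODE_COUNTS))
```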