@@ -180,6 +180,17 @@ def get_test_server_spec(self) -> ServerSpec:
180
180
],
181
181
)
182
182
183
def test_server_handle(self) -> None:
    """Check ``server_handle`` for a spec built without and with a namespace."""
    # A state is handed to the constructor but is not referenced by the
    # assertions in this test.
    any_state = specs.AppState.RUNNING

    without_namespace = ServerSpec(
        name="foo",
        scheduler="slurm",
        meshes=[],
        state=any_state,
    )
    self.assertEqual("slurm:///foo", without_namespace.server_handle)

    with_namespace = ServerSpec(
        name="foo",
        scheduler="slurm",
        namespace="prod",
        meshes=[],
        state=any_state,
    )
    self.assertEqual("slurm://prod/foo", with_namespace.server_handle)
193
+
183
194
def test_get_mesh_spec (self ) -> None :
184
195
server_spec = self .get_test_server_spec ()
185
196
mesh_spec = server_spec .get_mesh_spec ("trainer" )
@@ -196,3 +207,55 @@ def test_get_mesh_spec_not_found(self) -> None:
196
207
r"Mesh: 'worker' not found in job: monarch-foo-1a2b3c. Try one of: \['trainer', 'generator'\]" ,
197
208
):
198
209
server_spec .get_mesh_spec ("worker" )
210
+
211
def _1_mesh_2_host_server_spec(self, state: specs.AppState) -> ServerSpec:
    """Build a ``ServerSpec`` fixture containing one mesh with two hosts.

    Args:
        state: value passed through unchanged as the spec's ``state``.

    Returns:
        A ``ServerSpec`` named "foo" on the "slurm" scheduler whose only
        mesh is "trainer" with hostnames "compute-node-0" and
        "compute-node-1".
    """
    trainer_mesh = MeshSpec(
        name="trainer",
        num_hosts=2,
        hostnames=["compute-node-0", "compute-node-1"],
    )
    return ServerSpec(
        name="foo",
        scheduler="slurm",
        meshes=[trainer_mesh],
        state=state,
    )
224
+
225
def test_node0(self) -> None:
    """For a RUNNING server, ``host0("trainer")`` is expected to be "compute-node-0"."""
    running_server = self._1_mesh_2_host_server_spec(specs.AppState.RUNNING)
    first_host = running_server.host0("trainer")
    self.assertEqual("compute-node-0", first_host)
228
+
229
def test_node0_server_in_terminal_state(self) -> None:
    """``host0`` must raise ``RuntimeError`` for FAILED, SUCCEEDED and CANCELLED servers.

    The error message is expected to point the caller at ``get_or_create()``.
    """
    expected_message = (
        r"Use `monarch.tools.commands.get_or_create\(\)` to create a new server"
    )
    terminal_states = (
        specs.AppState.FAILED,
        specs.AppState.SUCCEEDED,
        specs.AppState.CANCELLED,
    )

    for terminal_state in terminal_states:
        # One sub-test per state so a failure reports which state broke.
        with self.subTest(terminal_state=terminal_state):
            server = self._1_mesh_2_host_server_spec(terminal_state)
            with self.assertRaisesRegex(RuntimeError, expected_message):
                server.host0("trainer")
242
+
243
def test_node0_server_in_pending_state(self) -> None:
    """``host0`` must raise ``RuntimeError`` for SUBMITTED and PENDING servers.

    The error message is expected to point the caller at ``server_ready()``.
    """
    expected_message = r"Use `monarch.tools.commands.server_ready\(\)` to wait for the server to be RUNNING"
    pending_states = (specs.AppState.SUBMITTED, specs.AppState.PENDING)

    for pending_state in pending_states:
        # One sub-test per state so a failure reports which state broke.
        with self.subTest(pending_state=pending_state):
            server = self._1_mesh_2_host_server_spec(pending_state)
            with self.assertRaisesRegex(RuntimeError, expected_message):
                server.host0("trainer")
252
+
253
# NOTE(review): the method name reads "illegal_tate"; presumably "illegal_state"
# was intended. The name is left as-is here so the test id does not change.
def test_node0_server_in_illegal_tate(self) -> None:
    """``host0`` must raise ``RuntimeError`` for UNSUBMITTED and UNKNOWN servers.

    The error message is expected to ask the user to report a bug.
    """
    expected_message = r"Please report this as a bug"
    illegal_states = (specs.AppState.UNSUBMITTED, specs.AppState.UNKNOWN)

    for illegal_state in illegal_states:
        # One sub-test per state so a failure reports which state broke.
        with self.subTest(illegal_state=illegal_state):
            server = self._1_mesh_2_host_server_spec(illegal_state)
            with self.assertRaisesRegex(RuntimeError, expected_message):
                server.host0("trainer")
0 commit comments