Skip to content

Commit aa05f63

Browse files
kiukchung authored and facebook-github-bot committed
(monarch/tools) add ServerSpec.host0() method that can be used to set MASTER_ADDR for PTD (#419)
Summary: Pull Request resolved: #419 Simplifies getting host0's hostname to set as MASTER_ADDR for torch.distributed. Usage: ``` from monarch.tools.commands import get_or_create server_info = await get_or_create(...) # allocate proc mesh -> create actor (code omitted for brevity)... trainer_actor.call( MASTER_ADDR=server_info.host0("trainer"), # trainer mesh's 1st host MASTER_PORT=29500, ... ) ``` Reviewed By: suo Differential Revision: D77690685 fbshipit-source-id: 1f37c370ac33282d3b9e34ee983a74acc91d68f6
1 parent 2803605 commit aa05f63

File tree

2 files changed

+111
-0
lines changed

2 files changed

+111
-0
lines changed

python/monarch/tools/mesh_spec.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from monarch.tools.network import get_sockaddr
1313
from torchx import specs
14+
from torchx.specs.api import is_terminal
1415

1516
DEFAULT_REMOTE_ALLOCATOR_PORT = 26600
1617

@@ -133,6 +134,53 @@ def server_handle(self) -> str:
133134
def is_running(self) -> bool:
134135
return self.state == specs.AppState.RUNNING
135136

137+
def host0(self, mesh_name: str) -> str:
    """The hostname of the first node in the given mesh.

    The return value of this method can be used to set the `MASTER_ADDR` env var
    for torch.distributed.

    NOTE: the state of this server must be RUNNING for this method to return a valid value.

    Usage:

    .. code-block:: python

        from monarch.tools.commands import get_or_create

        server_info = await get_or_create(...)
        assert server_info.is_running

        # allocate proc mesh -> create actor (code omitted for brevity)...

        trainer_actor.call(
            MASTER_ADDR=server_info.host0("trainer"),  # trainer mesh's 1st host
            MASTER_PORT=29500,
            ...
        )

    NOTE: The ordering of the hostnames is exactly the same as what comes back from the underlying
        scheduler's `describe_job` or `list_*` API. Please find the exact semantics in the
        respective scheduler's implementation in https://github.com/pytorch/torchx/tree/main/torchx/schedulers.

    Raises:
        RuntimeError: if the server is not RUNNING (still pending, already terminal,
            or in an unexpected state), or if the mesh has no hosts.
    """
    # Validates `mesh_name` up-front regardless of server state
    # (get_mesh_spec raises if the mesh is unknown).
    mesh_spec = self.get_mesh_spec(mesh_name)
    # NOTE(review): `self.is_running` is used here as an attribute; assumes it is a
    # @property (decorator not visible in this hunk) — confirm, else this branch
    # would always be truthy on the bound method.
    if self.is_running:
        # hostnames are only valid when the server is RUNNING
        if not mesh_spec.hostnames:
            raise RuntimeError(f"{self.server_handle} does not have any hosts")
        return mesh_spec.hostnames[0]
    elif self.state in [specs.AppState.SUBMITTED, specs.AppState.PENDING]:
        # Not scheduled yet: the caller should wait, not recreate.
        raise RuntimeError(
            f"{self.server_handle} is {self.state}."
            f" Use `monarch.tools.commands.server_ready()` to wait for the server to be {specs.AppState.RUNNING}"
        )
    elif is_terminal(self.state):
        # Finished (success, failure, or cancellation): a new server is required.
        raise RuntimeError(
            f"{self.server_handle} is {self.state}."
            " Use `monarch.tools.commands.get_or_create()` to create a new server"
        )
    else:
        # States like UNSUBMITTED/UNKNOWN should never be observed here.
        raise RuntimeError(
            f"{self.server_handle} is in an invalid state: {self.state}. Please report this as a bug"
        )
183+
136184
def get_mesh_spec(self, mesh_name: str) -> MeshSpec:
137185
for mesh_spec in self.meshes:
138186
if mesh_spec.name == mesh_name:

python/tests/tools/test_mesh_spec.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,17 @@ def get_test_server_spec(self) -> ServerSpec:
180180
],
181181
)
182182

183+
def test_server_handle(self) -> None:
    """server_handle is built from scheduler/namespace/name; state is irrelevant."""
    placeholder_state = specs.AppState.RUNNING

    no_namespace = ServerSpec(
        name="foo", scheduler="slurm", meshes=[], state=placeholder_state
    )
    self.assertEqual("slurm:///foo", no_namespace.server_handle)

    with_namespace = ServerSpec(
        name="foo",
        scheduler="slurm",
        namespace="prod",
        meshes=[],
        state=placeholder_state,
    )
    self.assertEqual("slurm://prod/foo", with_namespace.server_handle)
193+
183194
def test_get_mesh_spec(self) -> None:
184195
server_spec = self.get_test_server_spec()
185196
mesh_spec = server_spec.get_mesh_spec("trainer")
@@ -196,3 +207,55 @@ def test_get_mesh_spec_not_found(self) -> None:
196207
r"Mesh: 'worker' not found in job: monarch-foo-1a2b3c. Try one of: \['trainer', 'generator'\]",
197208
):
198209
server_spec.get_mesh_spec("worker")
210+
211+
def _1_mesh_2_host_server_spec(self, state: specs.AppState) -> ServerSpec:
    """Returns a server spec with one 'trainer' mesh of two hosts, in the given state."""
    trainer_mesh = MeshSpec(
        name="trainer",
        num_hosts=2,
        hostnames=["compute-node-0", "compute-node-1"],
    )
    return ServerSpec(
        name="foo",
        scheduler="slurm",
        meshes=[trainer_mesh],
        state=state,
    )
224+
225+
def test_node0(self) -> None:
    """host0() returns the mesh's first hostname when the server is RUNNING."""
    running_server = self._1_mesh_2_host_server_spec(specs.AppState.RUNNING)
    self.assertEqual("compute-node-0", running_server.host0("trainer"))
228+
229+
def test_node0_server_in_terminal_state(self) -> None:
    """host0() on a finished server tells the caller to create a new one."""
    terminal_states = (
        specs.AppState.FAILED,
        specs.AppState.SUCCEEDED,
        specs.AppState.CANCELLED,
    )
    for terminal_state in terminal_states:
        with self.subTest(terminal_state=terminal_state):
            finished_server = self._1_mesh_2_host_server_spec(terminal_state)
            with self.assertRaisesRegex(
                RuntimeError,
                r"Use `monarch.tools.commands.get_or_create\(\)` to create a new server",
            ):
                finished_server.host0("trainer")
242+
243+
def test_node0_server_in_pending_state(self) -> None:
    """host0() on a not-yet-running server tells the caller to wait for RUNNING."""
    not_yet_running = (specs.AppState.SUBMITTED, specs.AppState.PENDING)
    for pending_state in not_yet_running:
        with self.subTest(pending_state=pending_state):
            pending_server = self._1_mesh_2_host_server_spec(pending_state)
            with self.assertRaisesRegex(
                RuntimeError,
                r"Use `monarch.tools.commands.server_ready\(\)` to wait for the server to be RUNNING",
            ):
                pending_server.host0("trainer")
252+
253+
# Fix: test name typo `_tate` -> `_state`. Test method names are discovered by
# unittest, not called by other code, so the rename is safe.
def test_node0_server_in_illegal_state(self) -> None:
    """host0() raises a report-a-bug error for states that should never be observed."""
    for illegal_state in [specs.AppState.UNSUBMITTED, specs.AppState.UNKNOWN]:
        with self.subTest(illegal_state=illegal_state):
            server = self._1_mesh_2_host_server_spec(illegal_state)
            with self.assertRaisesRegex(
                RuntimeError,
                r"Please report this as a bug",
            ):
                server.host0("trainer")

0 commit comments

Comments
 (0)