Skip to content

Commit 89c31e7

Browse files
authored
infra: add integration test for MPI env vars propagation (#395)
1 parent de54357 commit 89c31e7

File tree

2 files changed

+64
-17
lines changed

2 files changed

+64
-17
lines changed

test/integration/sagemaker/test_horovod.py

Lines changed: 45 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -19,35 +19,63 @@
1919
from sagemaker.tensorflow import TensorFlow
2020
from sagemaker.utils import unique_name_from_base
2121

22-
RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
22+
# Path to the shared "resources" directory, located two levels above the
# directory containing this test module; entry-point scripts are resolved from it.
RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources")
2323

2424

2525
@pytest.mark.skip_generic
def test_distributed_training_horovod(
    sagemaker_session, instance_type, image_uri, tmpdir, framework_version
):
    """Train the Horovod MNIST script on two instances and check the artifact.

    Builds a script-mode ``TensorFlow`` estimator with MPI enabled (one
    process per host), runs ``fit`` under a unique job name, and then asserts
    that every file listed for the estimator's model data is named
    ``model.tar.gz``.

    All arguments are pytest fixtures; ``tmpdir`` is accepted but not used
    in the body.
    """
    training_script = os.path.join(RESOURCE_PATH, "mnist", "horovod_mnist.py")

    # MPI settings are handed to the container as ordinary hyperparameters.
    mpi_hyperparameters = {
        "sagemaker_mpi_enabled": True,
        "sagemaker_mpi_custom_mpi_options": "-verbose -x orte_base_help_aggregate=0",
        "sagemaker_mpi_num_of_processes_per_host": 1,
    }

    tf_estimator = TensorFlow(
        entry_point=training_script,
        role="SageMakerRole",
        train_instance_type=instance_type,
        train_instance_count=2,
        image_name=image_uri,
        framework_version=framework_version,
        py_version="py3",
        script_mode=True,
        hyperparameters=mpi_hyperparameters,
        sagemaker_session=sagemaker_session,
    )

    training_job_name = unique_name_from_base("test-tf-horovod")
    tf_estimator.fit(job_name=training_job_name)

    artifact_source = sagemaker.local.data.get_data_source_instance(
        tf_estimator.model_data, sagemaker_session
    )

    # Every listed artifact must be the packaged model archive.
    for artifact_path in artifact_source.get_file_list():
        assert os.path.basename(artifact_path) == "model.tar.gz"
56+
57+
58+
@pytest.mark.skip_generic
def test_distributed_training_horovod_with_env_vars(
    sagemaker_session, instance_type, image_uri, tmpdir, framework_version
):
    """Launch the ``train_hvd_env_vars.py`` script as a two-instance MPI job.

    Builds a script-mode ``TensorFlow`` estimator with MPI enabled and two
    processes per host, then runs ``fit`` under a unique job name. The test
    body makes no assertions of its own after ``fit``.

    NOTE(review): presumably the environment-variable checks live in the
    entry-point script and a failing job makes ``fit`` raise -- confirm.

    All arguments are pytest fixtures; ``tmpdir`` is accepted but not used
    in the body.
    """
    training_script = os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_env_vars.py")

    # MPI settings are handed to the container as ordinary hyperparameters.
    mpi_hyperparameters = {
        "sagemaker_mpi_enabled": True,
        "sagemaker_mpi_custom_mpi_options": "-verbose -x orte_base_help_aggregate=0",
        "sagemaker_mpi_num_of_processes_per_host": 2,
    }

    tf_estimator = TensorFlow(
        entry_point=training_script,
        role="SageMakerRole",
        train_instance_type=instance_type,
        train_instance_count=2,
        image_name=image_uri,
        framework_version=framework_version,
        py_version="py3",
        script_mode=True,
        hyperparameters=mpi_hyperparameters,
        sagemaker_session=sagemaker_session,
    )

    training_job_name = unique_name_from_base("test-tf-horovod-env-vars")
    tf_estimator.fit(job_name=training_job_name)
test/resources/hvdbasic/train_hvd_env_vars.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,19 @@
1+
import json
2+
import os
3+
import horovod.tensorflow as hvd
4+
5+
# Initialize Horovod before querying any rank or size information.
hvd.init()

# Each process writes its own file, named after its local and global rank.
rank_file_path = '/opt/ml/model/local-rank-%s-rank-%s' % (hvd.local_rank(), hvd.rank())

with open(rank_file_path, 'w+') as rank_file:
    rank_summary = {}
    rank_summary['local-rank'] = hvd.local_rank()
    rank_summary['rank'] = hvd.rank()
    rank_summary['size'] = hvd.size()

    print(rank_summary)
    rank_file.write(json.dumps(rank_summary))

# Read both environment variables first, then fail the script if either is unset.
credentials_uri = os.environ.get('AWS_CONTAINER_CREDENTIALS_RELATIVE_URI')
current_host = os.environ.get('SM_CURRENT_HOST')

assert credentials_uri is not None
assert current_host is not None

print('host {}: AWS_CONTAINER_CREDENTIALS_RELATIVE_URI={}'.format(current_host, credentials_uri))

0 commit comments

Comments
 (0)