Skip to content

Commit 05a785c

Browse files
authored
feature: include sm-data-distributed and upgrade dependencies (#410)
1 parent 2dae290 commit 05a785c

File tree

12 files changed, with 49 additions and 46 deletions.

CODE_OF_CONDUCT.md

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
+## Code of Conduct
+This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
+For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
+opensource-codeofconduct@amazon.com with any additional questions or comments.

NOTICE

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
+Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

buildspec-release.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,14 +12,14 @@ phases:
1212
# run unit tests
1313
- AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
1414
AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
15-
tox -e py27,py36,py37 --parallel all -- test/unit
15+
tox -e py36,py37 --parallel all -- test/unit
1616

1717
# run local integ tests
1818
#- $(aws ecr get-login --no-include-email --region us-west-2)
19-
#- IGNORE_COVERAGE=- tox -e py27,py36 -- test/integ/local
19+
#- IGNORE_COVERAGE=- tox -e py37 -- test/integ/local
2020

2121
# run sagemaker integ tests
22-
#- IGNORE_COVERAGE=- tox -e py27,py36 -- test/integ/sagemaker
22+
#- IGNORE_COVERAGE=- tox -e py37 -- test/integ/sagemaker
2323

2424
# generate the distribution package
2525
- python3 setup.py sdist

buildspec.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ env:
99
GITHUB_REPO: 'sagemaker-tensorflow-container'
1010
DLC_ACCOUNT: '763104351884'
1111
SETUP_FILE: 'setup_cmds.sh'
12-
SETUP_CMDS: '#!/bin/bash\npython3.6 -m pip install --upgrade pip\npython3.6 -m pip install -U -e .\npython3.6 -m pip install -U -e .[test]'
12+
SETUP_CMDS: '#!/bin/bash\npython3.6 -m pip install --upgrade pip==20.3.1\npython3.6 -m pip install -U .\npython3.6 -m pip install -U .[test]'
1313

1414
phases:
1515
pre_build:
@@ -30,7 +30,7 @@ phases:
3030
- tox -e flake8,twine
3131

3232
# run unit tests
33-
- tox -e py27,py36,py37 --parallel all test/unit
33+
- tox -e py36,py37 --parallel all test/unit
3434

3535
# define tags
3636
- GENERIC_TAG="$FRAMEWORK_VERSION-tensorflow-$BUILD_ID"

setup.py

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -36,15 +36,14 @@ def read_version():
3636
"pytest-cov",
3737
"pytest-xdist",
3838
"mock",
39-
"sagemaker==1.51.3",
39+
"sagemaker[local]>=2",
4040
"tensorflow<2.4",
4141
"docker-compose",
42-
"boto3==1.10.50",
43-
"six==1.13.0",
42+
"boto3==1.16.34",
4443
"python-dateutil>=2.1,<2.8.1",
45-
"botocore==1.13.50",
44+
"botocore==1.19.34",
4645
"requests-mock",
47-
"awscli==1.16.314",
46+
"awscli==1.18.194",
4847
]
4948

5049
if sys.version_info.major > 2:
@@ -68,12 +67,11 @@ def read_version():
6867
"Natural Language :: English",
6968
"License :: OSI Approved :: Apache Software License",
7069
"Programming Language :: Python",
71-
"Programming Language :: Python :: 2.7",
7270
"Programming Language :: Python :: 3.6",
7371
"Programming Language :: Python :: 3.7",
7472
],
7573
install_requires=[
76-
"sagemaker-training>=3.6.4",
74+
"sagemaker-training>=3.7.0",
7775
"numpy",
7876
"scipy",
7977
"sklearn",

src/sagemaker_tensorflow_container/training.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
logger = logging.getLogger(__name__)
2828

2929
SAGEMAKER_PARAMETER_SERVER_ENABLED = "sagemaker_parameter_server_enabled"
30+
SAGEMAKER_DISTRIBUTED_DATAPARALLEL_ENABLED = "sagemaker_distributed_dataparallel_enabled"
3031
MODEL_DIR = "/opt/ml/model"
3132

3233

@@ -136,6 +137,9 @@ def train(env, cmd_args):
136137
parameter_server_enabled = env.additional_framework_parameters.get(
137138
SAGEMAKER_PARAMETER_SERVER_ENABLED, False
138139
)
140+
sagemaker_distributed_dataparallel_enabled = env.additional_framework_parameters.get(
141+
SAGEMAKER_DISTRIBUTED_DATAPARALLEL_ENABLED, False
142+
)
139143
if len(env.hosts) > 1 and parameter_server_enabled:
140144

141145
tf_config = _build_tf_config(hosts=env.hosts, current_host=env.current_host)
@@ -155,6 +159,8 @@ def train(env, cmd_args):
155159

156160
if mpi_enabled:
157161
runner_type = runner.MPIRunnerType
162+
elif sagemaker_distributed_dataparallel_enabled:
163+
runner_type = runner.SMDataParallelRunnerType
158164
else:
159165
runner_type = runner.ProcessRunnerType
160166

test/integration/local/test_horovod.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -51,12 +51,11 @@ def _test_distributed_training_horovod(
5151
estimator = TensorFlow(
5252
entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_basic.py"),
5353
role="SageMakerRole",
54-
train_instance_type=instance_type,
54+
instance_type=instance_type,
5555
sagemaker_session=session,
56-
train_instance_count=instances,
57-
image_name=image_uri,
56+
instance_count=instances,
57+
image_uri=image_uri,
5858
output_path=output_path,
59-
framework_version=framework_version,
6059
hyperparameters={
6160
"sagemaker_mpi_enabled": True,
6261
"sagemaker_network_interface_name": "eth0",

test/integration/local/test_training.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -102,10 +102,10 @@ def run_tf_training(
102102
estimator = TensorFlow(
103103
entry_point=script,
104104
role="SageMakerRole",
105-
train_instance_count=instance_count,
106-
train_instance_type=instance_type,
105+
instance_count=instance_count,
106+
instance_type=instance_type,
107107
sagemaker_session=sagemaker_local_session,
108-
image_name=image_uri,
108+
image_uri=image_uri,
109109
model_dir="/opt/ml/model",
110110
output_path=output_path,
111111
hyperparameters=hyperparameters,

test/integration/sagemaker/test_horovod.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -31,12 +31,11 @@ def test_distributed_training_horovod(
3131
estimator = TensorFlow(
3232
entry_point=os.path.join(RESOURCE_PATH, "mnist", "horovod_mnist.py"),
3333
role="SageMakerRole",
34-
train_instance_type=instance_type,
35-
train_instance_count=2,
36-
image_name=image_uri,
34+
instance_type=instance_type,
35+
instance_count=2,
36+
image_uri=image_uri,
3737
framework_version=framework_version,
3838
py_version="py3",
39-
script_mode=True,
4039
hyperparameters={
4140
"sagemaker_mpi_enabled": True,
4241
"sagemaker_mpi_custom_mpi_options": mpi_options,
@@ -64,12 +63,11 @@ def test_distributed_training_horovod_with_env_vars(
6463
estimator = TensorFlow(
6564
entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_env_vars.py"),
6665
role="SageMakerRole",
67-
train_instance_type=instance_type,
68-
train_instance_count=2,
69-
image_name=image_uri,
66+
instance_type=instance_type,
67+
instance_count=2,
68+
image_uri=image_uri,
7069
framework_version=framework_version,
7170
py_version="py3",
72-
script_mode=True,
7371
hyperparameters={
7472
"sagemaker_mpi_enabled": True,
7573
"sagemaker_mpi_custom_mpi_options": mpi_options,

test/integration/sagemaker/test_mnist.py

Lines changed: 12 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -31,12 +31,11 @@ def test_mnist(sagemaker_session, image_uri, instance_type, framework_version):
3131
estimator = TensorFlow(
3232
entry_point=script,
3333
role="SageMakerRole",
34-
train_instance_type=instance_type,
35-
train_instance_count=1,
34+
instance_type=instance_type,
35+
instance_count=1,
3636
sagemaker_session=sagemaker_session,
37-
image_name=image_uri,
37+
image_uri=image_uri,
3838
framework_version=framework_version,
39-
script_mode=True,
4039
)
4140
inputs = estimator.sagemaker_session.upload_data(
4241
path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist"
@@ -51,12 +50,11 @@ def test_distributed_mnist_no_ps(sagemaker_session, image_uri, instance_type, fr
5150
estimator = TensorFlow(
5251
entry_point=script,
5352
role="SageMakerRole",
54-
train_instance_count=2,
55-
train_instance_type=instance_type,
53+
instance_count=2,
54+
instance_type=instance_type,
5655
sagemaker_session=sagemaker_session,
57-
image_name=image_uri,
56+
image_uri=image_uri,
5857
framework_version=framework_version,
59-
script_mode=True,
6058
)
6159
inputs = estimator.sagemaker_session.upload_data(
6260
path=os.path.join(resource_path, "mnist", "data"), key_prefix="scriptmode/mnist"
@@ -72,12 +70,11 @@ def test_distributed_mnist_ps(sagemaker_session, image_uri, instance_type, frame
7270
entry_point=script,
7371
role="SageMakerRole",
7472
hyperparameters={"sagemaker_parameter_server_enabled": True},
75-
train_instance_count=2,
76-
train_instance_type=instance_type,
73+
instance_count=2,
74+
instance_type=instance_type,
7775
sagemaker_session=sagemaker_session,
78-
image_name=image_uri,
76+
image_uri=image_uri,
7977
framework_version=framework_version,
80-
script_mode=True,
8178
)
8279
inputs = estimator.sagemaker_session.upload_data(
8380
path=os.path.join(resource_path, "mnist", "data-distributed"),
@@ -95,10 +92,10 @@ def test_tuning(sagemaker_session, image_uri, instance_type, framework_version):
9592
estimator = TensorFlow(
9693
entry_point=script,
9794
role="SageMakerRole",
98-
train_instance_type=instance_type,
99-
train_instance_count=1,
95+
instance_type=instance_type,
96+
instance_count=1,
10097
sagemaker_session=sagemaker_session,
101-
image_name=image_uri,
98+
image_uri=image_uri,
10299
framework_version=framework_version,
103100
script_mode=True,
104101
)

test/integration/sagemaker/test_tuning_model_dir.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,9 +28,9 @@ def test_model_dir_with_training_job_name(
2828
estimator = TensorFlow(
2929
entry_point=script,
3030
role="SageMakerRole",
31-
train_instance_type=instance_type,
32-
train_instance_count=1,
33-
image_name=image_uri,
31+
instance_type=instance_type,
32+
instance_count=1,
33+
image_uri=image_uri,
3434
framework_version=framework_version,
3535
py_version="py3",
3636
sagemaker_session=sagemaker_session,

tox.ini

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
# and then run "tox" from this directory.
55

66
[tox]
7-
envlist = py27,py36,py37,flake8
7+
envlist = py36,py37,flake8
88
skip_missing_interpreters = False
99

1010
[travis]

0 commit comments

Comments
 (0)