
Commit 2679b28

infra: refactor toolkit tests. (#333)
1 parent c8af155 commit 2679b28

23 files changed (+270, -874 lines)

.flake8

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 [flake8]
-application_import_names = integration, sagemaker_tensorflow_container, test, test-toolkit, timeout, utils
+application_import_names = image_utils, integration, sagemaker_tensorflow_container, test, test-toolkit, timeout, utils
 import-order-style = google

buildspec-toolkit.yml

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
version: 0.2

env:
  variables:
    FRAMEWORK_VERSION: '2.1.0'
    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
    GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
    ECR_REPO: 'sagemaker-test'
    GITHUB_REPO: 'sagemaker-tensorflow-container'
    DLC_ACCOUNT: '763104351884'
    SETUP_FILE: 'setup_cmds.sh'
    SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]'

phases:
  pre_build:
    commands:
      - start-dockerd
      - ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
      - PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
      - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
      - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'

  build:
    commands:
      - TOX_PARALLEL_NO_SPINNER=1
      - PY_COLORS=0

      # install
      - pip3 install -U -e .[test]

      # run linters
      - tox -e flake8,twine

      # run unit tests
      - tox -e py36,py27 test-toolkit/unit

      # define tags
      - GENERIC_TAG="$FRAMEWORK_VERSION-tensorflow-$BUILD_ID"
      - DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
      - DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"

      # run local CPU integration tests (build and push the image to ECR repo)
      - test_cmd="pytest test-toolkit/integration/local --build-image --push-image --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml"
      - test_cmd="pytest test-toolkit/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml"

      # launch remote GPU instance
      - prefix='ml.'
      - instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
      - create-key-pair
      - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest

      # build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
      - python3 setup.py sdist
      - build_dir="test-toolkit/docker/$FRAMEWORK_VERSION"
      - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
      - docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
      # push DLC GPU image to ECR
      - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
      - docker push $PREPROD_IMAGE:$DLC_GPU_TAG

      # run GPU local integration tests
      - printf "$SETUP_CMDS" > $SETUP_FILE
      # no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
      - generic_cmd="pytest test-toolkit/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml"
      - dlc_cmd="pytest test-toolkit/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml"

      # run CPU sagemaker integration tests
      - test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml"
      - test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml"

      # run GPU sagemaker integration tests
      - test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml"
      - test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml"
    finally:
      # shut down remote GPU instance
      - cleanup-gpu-instances
      - cleanup-key-pairs

      # remove ECR image
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG
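The finally block tears down the GPU instance and then deletes the temporary image tags with the AWS CLI. For reference, a hedged boto3 equivalent of that ECR cleanup (repository, region, and tag values are illustrative; this helper is not part of the commit):

import boto3


def delete_test_image_tags(repository, tags, region):
    # Sketch only: mirrors the `aws ecr batch-delete-image` calls in the
    # buildspec's finally block.
    ecr = boto3.client('ecr', region_name=region)
    for tag in tags:
        response = ecr.batch_delete_image(repositoryName=repository,
                                          imageIds=[{'imageTag': tag}])
        for failure in response.get('failures', []):
            # a tag that was never pushed simply shows up as a failure here
            print('could not delete {}: {}'.format(tag, failure.get('failureReason')))


# e.g. delete_test_image_tags('sagemaker-test', [generic_tag, dlc_cpu_tag, dlc_gpu_tag], 'us-west-2')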

test-toolkit/docker/2.1.0/Dockerfile.dlc.cpu

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.1.0-cpu-py2

COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
    rm /sagemaker_tensorflow_training.tar.gz

test-toolkit/docker/2.1.0/Dockerfile.dlc.gpu

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.1.0-gpu-py3

COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
    rm /sagemaker_tensorflow_training.tar.gz

test-toolkit/docker/2.1.0/Dockerfile.tf

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
FROM tensorflow/tensorflow:2.1.0-gpu-py3

ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main

COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \
    rm /sagemaker_tensorflow_training.tar.gz

test-toolkit/integration/conftest.py

Lines changed: 63 additions & 31 deletions
@@ -18,9 +18,10 @@
 import boto3
 import pytest
 from sagemaker import LocalSession, Session
-from sagemaker.tensorflow import TensorFlow
 
-from test.integration import NO_P2_REGIONS, NO_P3_REGIONS
+from integration import image_utils
+from integration import NO_P2_REGIONS, NO_P3_REGIONS
+
 
 logger = logging.getLogger(__name__)
 logging.getLogger('boto').setLevel(logging.INFO)
@@ -29,14 +30,19 @@
 logging.getLogger('auth.py').setLevel(logging.INFO)
 logging.getLogger('connectionpool.py').setLevel(logging.INFO)
 
-SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
+DIR_PATH = os.path.dirname(os.path.realpath(__file__))
 
 
 def pytest_addoption(parser):
-    parser.addoption('--docker-base-name', default='sagemaker-tensorflow-scriptmode')
+    parser.addoption('--build-image', '-B', action='store_true')
+    parser.addoption('--push-image', '-P', action='store_true')
+    parser.addoption('--dockerfile-type', '-T', choices=['dlc.cpu', 'dlc.gpu', 'tf'],
+                     default='tf')
+    parser.addoption('--dockerfile', '-D', default=None)
+    parser.addoption('--docker-base-name', default='sagemaker-tensorflow-training')
     parser.addoption('--tag', default=None)
     parser.addoption('--region', default='us-west-2')
-    parser.addoption('--framework-version', default=TensorFlow.LATEST_VERSION)
+    parser.addoption('--framework-version', default='2.1.0')
     parser.addoption('--processor', default='cpu', choices=['cpu', 'gpu', 'cpu,gpu'])
     parser.addoption('--py-version', default='3', choices=['2', '3', '2,3'])
     parser.addoption('--account-id', default='142577830533')
@@ -48,6 +54,38 @@ def pytest_configure(config):
     os.environ['TEST_PROCESSORS'] = config.getoption('--processor')
 
 
+@pytest.fixture(scope='session', name='dockerfile_type')
+def fixture_dockerfile_type(request):
+    return request.config.getoption('--dockerfile-type')
+
+
+@pytest.fixture(scope='session', name='dockerfile')
+def fixture_dockerfile(request, dockerfile_type):
+    dockerfile = request.config.getoption('--dockerfile')
+    return dockerfile if dockerfile else 'Dockerfile.{}'.format(dockerfile_type)
+
+
+@pytest.fixture(scope='session', name='build_image', autouse=True)
+def fixture_build_image(request, framework_version, dockerfile, image_uri, region):
+    build_image = request.config.getoption('--build-image')
+    if build_image:
+        return image_utils.build_image(framework_version=framework_version,
+                                       dockerfile=dockerfile,
+                                       image_uri=image_uri,
+                                       region=region,
+                                       cwd=os.path.join(DIR_PATH, '..', '..'))
+
+    return image_uri
+
+
+@pytest.fixture(scope='session', name='push_image', autouse=True)
+def fixture_push_image(request, image_uri, region, account_id):
+    push_image = request.config.getoption('--push-image')
+    if push_image:
+        return image_utils.push_image(image_uri, region, account_id)
+    return None
+
+
 @pytest.fixture(scope='session')
 def docker_base_name(request):
     return request.config.getoption('--docker-base-name')
@@ -63,7 +101,7 @@ def framework_version(request):
     return request.config.getoption('--framework-version')
 
 
-@pytest.fixture
+@pytest.fixture(scope='session')
 def tag(request, framework_version, processor, py_version):
     provided_tag = request.config.getoption('--tag')
     default_tag = '{}-{}-py{}'.format(framework_version, processor, py_version)
@@ -92,20 +130,6 @@ def instance_type(request, processor):
     return provided_instance_type if provided_instance_type is not None else default_instance_type
 
 
-@pytest.fixture()
-def py_version():
-    if 'TEST_PY_VERSIONS' in os.environ:
-        return os.environ['TEST_PY_VERSIONS'].split(',')
-    return None
-
-
-@pytest.fixture()
-def processor():
-    if 'TEST_PROCESSORS' in os.environ:
-        return os.environ['TEST_PROCESSORS'].split(',')
-    return None
-
-
 @pytest.fixture(autouse=True)
 def skip_by_device_type(request, processor):
     is_gpu = (processor == 'gpu')
@@ -121,19 +145,27 @@ def skip_gpu_instance_restricted_regions(region, instance_type):
         pytest.skip('Skipping GPU test in region {}'.format(region))
 
 
-@pytest.fixture
-def docker_image(docker_base_name, tag):
-    return '{}:{}'.format(docker_base_name, tag)
-
-
-@pytest.fixture
-def ecr_image(account_id, docker_base_name, tag, region):
-    return '{}.dkr.ecr.{}.amazonaws.com/{}:{}'.format(
-        account_id, region, docker_base_name, tag)
-
-
 @pytest.fixture(autouse=True)
 def skip_py2_containers(request, tag):
     if request.node.get_closest_marker('skip_py2_containers'):
         if 'py2' in tag:
             pytest.skip('Skipping python2 container with tag {}'.format(tag))
+
+
+@pytest.fixture(autouse=True)
+def skip_by_dockerfile_type(request, dockerfile_type):
+    is_generic = (dockerfile_type == 'tf')
+    if request.node.get_closest_marker('skip_generic') and is_generic:
+        pytest.skip('Skipping because running generic image without mpi and horovod')
+
+
+@pytest.fixture(name='docker_registry', scope='session')
+def fixture_docker_registry(account_id, region):
+    return '{}.dkr.ecr.{}.amazonaws.com'.format(account_id, region) if account_id else None
+
+
+@pytest.fixture(name='image_uri', scope='session')
+def fixture_image_uri(docker_registry, docker_base_name, tag):
+    if docker_registry:
+        return '{}/{}:{}'.format(docker_registry, docker_base_name, tag)
+    return '{}:{}'.format(docker_base_name, tag)
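
Taken together, the new session-scoped fixtures resolve the test image once per run: docker_registry, docker_base_name, and tag yield image_uri, and the autouse build_image/push_image fixtures act on it only when --build-image or --push-image is passed. A hedged sketch of a local test that consumes the resolved image (the entry point, role, and script are illustrative and not part of the commit):

import os

import pytest
from sagemaker.tensorflow import TensorFlow

RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')


@pytest.mark.skip_gpu
def test_mnist_local(sagemaker_local_session, image_uri, framework_version, tmpdir):
    # image_uri comes from the session fixture in conftest.py; by this point the
    # autouse build_image/push_image fixtures have already built/pushed it if requested.
    estimator = TensorFlow(entry_point=os.path.join(RESOURCE_PATH, 'mnist', 'mnist.py'),
                           role='SageMakerRole',
                           train_instance_type='local',
                           train_instance_count=1,
                           image_name=image_uri,
                           framework_version=framework_version,
                           py_version='py3',
                           output_path='file://{}'.format(tmpdir),
                           sagemaker_session=sagemaker_local_session)
    estimator.fit()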

test-toolkit/integration/image_utils.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import os
import subprocess
import sys

CYAN_COLOR = '\033[36m'
END_COLOR = '\033[0m'
DLC_AWS_ID = '763104351884'


def build_image(framework_version, dockerfile, image_uri, region, cwd='.'):
    _check_call('python setup.py sdist')

    if 'dlc' in dockerfile:
        ecr_login(region, DLC_AWS_ID)

    dockerfile_location = os.path.join('test-toolkit', 'docker', framework_version, dockerfile)

    subprocess.check_call(
        ['docker', 'build', '-t', image_uri, '-f', dockerfile_location, '--build-arg',
         'region={}'.format(region), cwd], cwd=cwd)
    print('created image {}'.format(image_uri))
    return image_uri


def push_image(ecr_image, region, aws_id):
    ecr_login(region, aws_id)
    _check_call('docker push {}'.format(ecr_image))


def ecr_login(region, aws_id):
    login = _check_call('aws ecr get-login --registry-ids {} '.format(aws_id)
                        + '--no-include-email --region {}'.format(region))
    _check_call(login.decode('utf-8').rstrip('\n'))


def _check_call(cmd, *popenargs, **kwargs):
    if isinstance(cmd, str):
        cmd = cmd.split(" ")
    _print_cmd(cmd)
    return subprocess.check_output(cmd, *popenargs, **kwargs)


def _print_cmd(cmd):
    print('executing docker command: {}{}{}'.format(CYAN_COLOR, ' '.join(cmd), END_COLOR))
    sys.stdout.flush()
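
image_utils shells out to docker and the aws CLI. A hedged sketch of driving it by hand from the repository root, mirroring what the build_image/push_image fixtures do (account, region, and tag are illustrative, and the test-toolkit directory is assumed to be on PYTHONPATH):

import os

from integration import image_utils

account_id = '123456789012'   # illustrative account
region = 'us-west-2'
image_uri = '{}.dkr.ecr.{}.amazonaws.com/sagemaker-test:2.1.0-dlc-gpu-manual'.format(
    account_id, region)

# build the DLC-based GPU image from test-toolkit/docker/2.1.0/Dockerfile.dlc.gpu
image_utils.build_image(framework_version='2.1.0',
                        dockerfile='Dockerfile.dlc.gpu',
                        image_uri=image_uri,
                        region=region,
                        cwd=os.path.abspath('.'))

# push it to the test ECR repository
image_utils.push_image(image_uri, region, account_id)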

test-toolkit/integration/local/test_horovod.py

Lines changed: 4 additions & 3 deletions
@@ -19,12 +19,13 @@
 import pytest
 from sagemaker.tensorflow import TensorFlow
 
-from test.integration.utils import processor, py_version  # noqa: F401
+from integration.utils import processor, py_version  # noqa: F401
 
 RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
 
 
 @pytest.mark.skip_gpu
+@pytest.mark.skip_generic
 @pytest.mark.parametrize('instances, processes', [
     [1, 2],
     (2, 1),
@@ -33,7 +34,7 @@
 def test_distributed_training_horovod_basic(instances,
                                             processes,
                                             sagemaker_local_session,
-                                            docker_image,
+                                            image_uri,
                                             tmpdir,
                                             framework_version):
     output_path = 'file://%s' % tmpdir
@@ -43,7 +44,7 @@ def test_distributed_training_horovod_basic(instances,
                           train_instance_type='local',
                           sagemaker_session=sagemaker_local_session,
                           train_instance_count=instances,
-                          image_name=docker_image,
+                          image_name=image_uri,
                           output_path=output_path,
                           framework_version=framework_version,
                           hyperparameters={'sagemaker_mpi_enabled': True,
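
The skip_generic mark introduced here (alongside the existing skip_gpu and skip_py2_containers marks) could be declared in the toolkit's pytest_configure so pytest does not warn about unknown marks, if it is not already registered elsewhere in the repo's config. A hedged sketch to add to the existing pytest_configure in conftest.py, with illustrative descriptions:

def pytest_configure(config):
    # Sketch only: register the custom marks used by the toolkit tests.
    config.addinivalue_line('markers', 'skip_gpu: skip the test when running against a GPU image')
    config.addinivalue_line('markers', 'skip_generic: skip on the generic TF image (no MPI/Horovod)')
    config.addinivalue_line('markers', 'skip_py2_containers: skip when the image tag is a py2 build')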
