Skip to content

Commit ff6be20

Browse files
authored
infra: include granular buildspecs for dlc and generic cpu and gpu testing (#413)
1 parent 92eb20a commit ff6be20

8 files changed

+219
-88
lines changed

buildspec-container-pr.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ phases:
44
pre_build:
55
commands:
66
- PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
7-
- echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'
7+
- echo 'Pull request number:' $PR_NUM '. No value means this build is not from a pull request.'
88

99
build:
1010
commands:

buildspec-dlc-cpu-tests.yml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# CodeBuild buildspec for the DLC CPU image.
# Runs the local integration tests (which build and push the image to ECR),
# then the SageMaker integration tests against that image, and finally
# removes the image from ECR.
version: 0.2

env:
  variables:
    FRAMEWORK_VERSION: '2.3.1'
    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
    ECR_REPO: 'sagemaker-test'

phases:
  pre_build:
    commands:
      - start-dockerd
      # look up the caller's AWS account ID through the regional STS endpoint
      - ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
      # extract the digits from the source version; only used for the log line below
      - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      # replace ':' with '-' in the CodeBuild build ID; the result is embedded in the image tag
      - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
      - echo 'Pull request number:' $PR_NUM '. No value means this build is not from a pull request.'

  build:
    commands:
      # exported so that child processes (tox) see them; a bare assignment
      # would stay local to the build shell
      - export TOX_PARALLEL_NO_SPINNER=1
      - export PY_COLORS=0

      # define tags
      - DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"

      # establish common test options
      - TEST_OPTS=" --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"

      # run local CPU integration tests (build and push the image to ECR repo)
      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image $TEST_OPTS"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-dlc-cpu-tests.yml"

      # run CPU sagemaker integration tests
      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --instance-type $CPU_INSTANCE_TYPE $TEST_OPTS"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-dlc-cpu-tests.yml"
    finally:
      # remove ECR image
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG

buildspec-dlc-gpu-tests.yml

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# CodeBuild buildspec for the DLC GPU image.
# Builds the image on the build host and pushes it to ECR, runs the local
# integration tests on a remote GPU instance, runs the SageMaker integration
# tests, and finally tears down the instance, key pair and ECR image.
version: 0.2

env:
  variables:
    FRAMEWORK_VERSION: '2.3.1'
    GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
    ECR_REPO: 'sagemaker-test'
    GITHUB_REPO: 'sagemaker-tensorflow-training-toolkit'
    DLC_ACCOUNT: '763104351884'
    SETUP_FILE: 'setup_cmds.sh'
    # single-quoted, so the \n sequences stay literal here; printf expands
    # them when the script is written to $SETUP_FILE in the build phase
    SETUP_CMDS: '#!/bin/bash\npython3.6 -m pip install --upgrade pip==20.3.1\npython3.6 -m pip install -U .\npython3.6 -m pip install -U .[test]'

phases:
  pre_build:
    commands:
      - start-dockerd
      # look up the caller's AWS account ID through the regional STS endpoint
      - ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
      # ECR repository URI that the test image is tagged with and pushed to
      - PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
      # extract the digits from the source version; passed to remote-test as --pr-number
      - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      # replace ':' with '-' in the CodeBuild build ID; the result is embedded in the image tag
      - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
      - echo 'Pull request number:' $PR_NUM '. No value means this build is not from a pull request.'

  build:
    commands:
      # exported so that child processes (tox) see them; a bare assignment
      # would stay local to the build shell
      - export TOX_PARALLEL_NO_SPINNER=1
      - export PY_COLORS=0

      # define tags
      - DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"

      # establish common test options
      - TEST_OPTS=" --dockerfile-type dlc.gpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"

      # build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
      - python3.6 setup.py sdist
      - build_dir="test/container/$FRAMEWORK_VERSION"
      # log in to the DLC account's registry before building
      # (presumably the Dockerfile's base image is pulled from there -- confirm)
      - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
      - docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
      # push DLC GPU image to ECR
      - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
      - docker push $PREPROD_IMAGE:$DLC_GPU_TAG

      # launch remote GPU instance
      # strip the leading 'ml.' from the SageMaker instance type (ml.p2.xlarge -> p2.xlarge)
      - prefix='ml.'
      - instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
      - create-key-pair
      - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest

      # run dlc gpu local tests on remote host
      - printf "$SETUP_CMDS" > $SETUP_FILE
      - dlc_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local $TEST_OPTS"
      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-dlc-gpu-tests.yml"

      # run GPU sagemaker integration tests
      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --instance-type $GPU_INSTANCE_TYPE $TEST_OPTS"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-dlc-gpu-tests.yml"
    finally:
      # shut down remote GPU instance
      - cleanup-gpu-instances
      - cleanup-key-pairs

      # remove ECR image
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG

buildspec-gen-cpu-tests.yml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# CodeBuild buildspec for the generic (Dockerfile type "tf") CPU image.
# Runs the local integration tests (which build and push the image to ECR),
# then the SageMaker integration tests against that image, and finally
# removes the image from ECR.
version: 0.2

env:
  variables:
    FRAMEWORK_VERSION: '2.3.1'
    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
    ECR_REPO: 'sagemaker-test'

phases:
  pre_build:
    commands:
      - start-dockerd
      # look up the caller's AWS account ID through the regional STS endpoint
      - ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
      # extract the digits from the source version; only used for the log line below
      - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      # replace ':' with '-' in the CodeBuild build ID; the result is embedded in the image tag
      - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
      - echo 'Pull request number:' $PR_NUM '. No value means this build is not from a pull request.'

  build:
    commands:
      # exported so that child processes (tox) see them; a bare assignment
      # would stay local to the build shell
      - export TOX_PARALLEL_NO_SPINNER=1
      - export PY_COLORS=0

      # define tags
      - GEN_CPU_TAG="$FRAMEWORK_VERSION-gen-cpu-$BUILD_ID"

      # establish common test options
      - TEST_OPTS=" --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GEN_CPU_TAG"

      # run local CPU integration tests (build and push the image to ECR repo)
      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image $TEST_OPTS"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-gen-cpu-tests.yml"

      # run CPU sagemaker integration tests
      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --instance-type $CPU_INSTANCE_TYPE $TEST_OPTS"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-gen-cpu-tests.yml"
    finally:
      # remove ECR image
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GEN_CPU_TAG

buildspec-gen-gpu-tests.yml

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# CodeBuild buildspec for the generic (Dockerfile type "tf") GPU image.
# Builds the image on the build host and pushes it to ECR, runs the local
# integration tests on a remote GPU instance, runs the SageMaker integration
# tests, and finally tears down the instance, key pair and ECR image.
version: 0.2

env:
  variables:
    FRAMEWORK_VERSION: '2.3.1'
    GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
    ECR_REPO: 'sagemaker-test'
    GITHUB_REPO: 'sagemaker-tensorflow-training-toolkit'
    # NOTE(review): not referenced by any command in this buildspec -- confirm it is still needed
    DLC_ACCOUNT: '763104351884'
    SETUP_FILE: 'setup_cmds.sh'
    # single-quoted, so the \n sequences stay literal here; printf expands
    # them when the script is written to $SETUP_FILE in the build phase
    SETUP_CMDS: '#!/bin/bash\npython3.6 -m pip install --upgrade pip==20.3.1\npython3.6 -m pip install -U .\npython3.6 -m pip install -U .[test]'

phases:
  pre_build:
    commands:
      - start-dockerd
      # look up the caller's AWS account ID through the regional STS endpoint
      - ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
      # ECR repository URI that the test image is tagged with and pushed to
      - PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
      # extract the digits from the source version; passed to remote-test as --pr-number
      - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      # replace ':' with '-' in the CodeBuild build ID; the result is embedded in the image tag
      - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
      - echo 'Pull request number:' $PR_NUM '. No value means this build is not from a pull request.'

  build:
    commands:
      # exported so that child processes (tox) see them; a bare assignment
      # would stay local to the build shell
      - export TOX_PARALLEL_NO_SPINNER=1
      - export PY_COLORS=0

      # define tags
      - GEN_GPU_TAG="$FRAMEWORK_VERSION-gen-gpu-$BUILD_ID"

      # establish common test options
      - TEST_OPTS=" --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GEN_GPU_TAG"

      # build Generic GPU image on build host instead of GPU instance
      - python3.6 setup.py sdist
      - build_dir="test/container/$FRAMEWORK_VERSION"
      - docker build -f "$build_dir/Dockerfile.tf" -t $PREPROD_IMAGE:$GEN_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
      # push Generic GPU image to ECR
      - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
      - docker push $PREPROD_IMAGE:$GEN_GPU_TAG

      # launch remote GPU instance
      # strip the leading 'ml.' from the SageMaker instance type (ml.p2.xlarge -> p2.xlarge)
      - prefix='ml.'
      - instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
      - create-key-pair
      - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest

      # run generic gpu local tests on remote host
      - printf "$SETUP_CMDS" > $SETUP_FILE
      - generic_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local $TEST_OPTS"
      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-gen-gpu-tests.yml"

      # run GPU sagemaker integration tests
      - test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --instance-type $GPU_INSTANCE_TYPE $TEST_OPTS"
      - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-gen-gpu-tests.yml"
    finally:
      # shut down remote GPU instance
      - cleanup-gpu-instances
      - cleanup-key-pairs

      # remove ECR image
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GEN_GPU_TAG

buildspec-release.yml

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,6 @@ phases:
1414
AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
1515
tox -e py36,py37 --parallel all -- test/unit
1616

17-
# run local integ tests
18-
#- $(aws ecr get-login --no-include-email --region us-west-2)
19-
#- IGNORE_COVERAGE=- tox -e py37 -- test/integ/local
20-
21-
# run sagemaker integ tests
22-
#- IGNORE_COVERAGE=- tox -e py37 -- test/integ/sagemaker
23-
24-
# generate the distribution package
25-
- python3 setup.py sdist
26-
2717
# publish the release to github
2818
- git-release --publish
2919

buildspec.yml

Lines changed: 0 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,6 @@
11
version: 0.2
22

3-
env:
4-
variables:
5-
FRAMEWORK_VERSION: '2.3.1'
6-
CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
7-
GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
8-
ECR_REPO: 'sagemaker-test'
9-
GITHUB_REPO: 'sagemaker-tensorflow-container'
10-
DLC_ACCOUNT: '763104351884'
11-
SETUP_FILE: 'setup_cmds.sh'
12-
SETUP_CMDS: '#!/bin/bash\npython3.6 -m pip install --upgrade pip==20.3.1\npython3.6 -m pip install -U .\npython3.6 -m pip install -U .[test]'
13-
143
phases:
15-
pre_build:
16-
commands:
17-
- start-dockerd
18-
- ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
19-
- PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
20-
- PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
21-
- BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
22-
- echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'
23-
244
build:
255
commands:
266
- TOX_PARALLEL_NO_SPINNER=1
@@ -31,60 +11,3 @@ phases:
3111

3212
# run unit tests
3313
- tox -e py36,py37 --parallel all test/unit
34-
35-
# define tags
36-
- GENERIC_TAG="$FRAMEWORK_VERSION-tensorflow-$BUILD_ID"
37-
- DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
38-
- DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"
39-
40-
# run local CPU integration tests (build and push the image to ECR repo)
41-
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
42-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
43-
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
44-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
45-
46-
# launch remote GPU instance
47-
- prefix='ml.'
48-
- instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
49-
- create-key-pair
50-
- launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest
51-
52-
# build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
53-
- python3.6 setup.py sdist
54-
- build_dir="test/container/$FRAMEWORK_VERSION"
55-
- $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
56-
- docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
57-
# push DLC GPU image to ECR
58-
- $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
59-
- docker push $PREPROD_IMAGE:$DLC_GPU_TAG
60-
61-
# run GPU local integration tests
62-
- printf "$SETUP_CMDS" > $SETUP_FILE
63-
# no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
64-
- generic_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
65-
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
66-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
67-
- dlc_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/local --dockerfile-type dlc.gpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
68-
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
69-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
70-
71-
# run CPU sagemaker integration tests
72-
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
73-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
74-
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
75-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
76-
77-
# run GPU sagemaker integration tests
78-
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --dockerfile-type tf --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
79-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
80-
- test_cmd="IGNORE_COVERAGE=- tox -e py36 -- test/integration/sagemaker -n auto --reruns 3 --reruns-delay 15 --dockerfile-type dlc.gpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --account-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
81-
- execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"
82-
finally:
83-
# shut down remote GPU instance
84-
- cleanup-gpu-instances
85-
- cleanup-key-pairs
86-
87-
# remove ECR image
88-
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
89-
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
90-
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG

test/unit/test_training.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,21 @@ def test_train_horovod(run_module, single_machine_training_env):
107107
)
108108

109109

110+
@patch("sagemaker_training.entry_point.run")
def test_train_smdataparallel(run_module, single_machine_training_env):
    """Enabling the smdataparallel flag makes train() launch the entry point
    with the SMDataParallel runner type."""
    env = single_machine_training_env
    env.additional_framework_parameters["sagemaker_distributed_dataparallel_enabled"] = True

    training.train(env, MODEL_DIR_CMD_LIST)

    # Built after train() has run, so env_vars reflects the same environment
    # state the original assertion compared against.
    expected_kwargs = dict(
        uri=MODULE_DIR,
        user_entry_point=MODULE_NAME,
        args=MODEL_DIR_CMD_LIST,
        env_vars=env.to_env_vars(),
        capture_error=True,
        runner_type=runner.SMDataParallelRunnerType,
    )
    run_module.assert_called_with(**expected_kwargs)
123+
124+
110125
@pytest.mark.skip_on_pipeline
111126
@pytest.mark.skipif(
112127
sys.version_info.major != 3, reason="Skip this for python 2 because of dict key order mismatch"

0 commit comments

Comments
 (0)