
Fixes for podman run, GitHub Actions #95


Merged
merged 15 commits on Jan 2, 2025
Changes from all commits
2 changes: 1 addition & 1 deletion .github/workflows/test-mlperf-inference-mixtral.yml
@@ -31,5 +31,5 @@ jobs:
git config --global credential.helper store
huggingface-cli login --token ${{ secrets.HF_TOKEN }} --add-to-git-credential
cm pull repo
- cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --pull_changes=yes --pull_inference_changes=yes --model=mixtral-8x7b --implementation=reference --batch_size=1 --precision=${{ matrix.precision }} --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker_it=no --docker_cm_repo=gateoverflow@mlperf-automations --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --docker --quiet --test_query_count=3 --target_qps=0.001 --clean --env.CM_MLPERF_MODEL_MIXTRAL_8X7B_DOWNLOAD_TO_HOST=yes --env.CM_MLPERF_DATASET_MIXTRAL_8X7B_DOWNLOAD_TO_HOST=yes --adr.openorca-mbxp-gsm8k-combined-preprocessed.tags=_size.1
+ cm run script --tags=run-mlperf,inference,_submission,_short --adr.inference-src.tags=_branch.dev --submitter="MLCommons" --pull_changes=yes --pull_inference_changes=yes --model=mixtral-8x7b --implementation=reference --batch_size=1 --precision=${{ matrix.precision }} --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker_it=no --docker_cm_repo=gateoverflow@mlperf-automations --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --docker --quiet --test_query_count=3 --target_qps=0.001 --clean --env.CM_MLPERF_MODEL_MIXTRAL_8X7B_DOWNLOAD_TO_HOST=yes --env.CM_MLPERF_DATASET_MIXTRAL_8X7B_DOWNLOAD_TO_HOST=yes --adr.openorca-mbxp-gsm8k-combined-preprocessed.tags=_size.1
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/mlcommons/mlperf_inference_test_submissions_v5.0 --repo_branch=dev --commit_message="Results from self hosted Github actions - GO-phoenix" --quiet --submission_dir=$HOME/gh_action_submissions
2 changes: 1 addition & 1 deletion .github/workflows/test-mlperf-inference-rgat.yml
@@ -31,7 +31,7 @@ jobs:
cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }}
- name: Test MLPerf Inference R-GAT using ${{ matrix.backend }} on ${{ matrix.os }}
run: |
- cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --pull_changes=yes --pull_inference_changes=yes --submitter="MLCommons" --hw_name=gh_${{ matrix.os }}_x86 --model=rgat --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --category=datacenter --quiet -v --target_qps=1
+ cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --adr.inference-src.tags=_branch.dev --pull_changes=yes --pull_inference_changes=yes --submitter="MLCommons" --hw_name=gh_${{ matrix.os }}_x86 --model=rgat --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --category=datacenter --quiet -v --target_qps=1
- name: Push Results
if: github.repository_owner == 'gateoverflow'
env:
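A note on the change common to both workflows above: --adr.inference-src.tags=_branch.dev appends the _branch.dev variation to the inference-src dependency, so the MLPerf inference sources are checked out from the dev branch rather than the default one. Below is a minimal sketch of the same run through the CM Python API, assuming the cmind package is installed and the automation repo has been pulled; the 'adr' input key is assumed to mirror the CLI's --adr.<name>.tags flags.

import cmind

# Hedged sketch: short R-GAT submission run with inference sources pinned to dev.
r = cmind.access({
    'action': 'run',
    'automation': 'script',
    'tags': 'run,mlperf,inference,generate-run-cmds,_submission,_short',
    # equivalent of --adr.inference-src.tags=_branch.dev on the command line
    'adr': {'inference-src': {'tags': '_branch.dev'}},
    'quiet': True,
})
if r['return'] > 0:
    raise RuntimeError(r.get('error', 'cm script run failed'))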
@@ -2,7 +2,7 @@ name: MLPerf Inference Nvidia implementations

on:
schedule:
- - cron: "08 01 * * */3" #to be adjusted
+ - cron: "58 10 * * *" #to be adjusted

jobs:
run_nvidia:
@@ -17,20 +17,31 @@ jobs:
strategy:
fail-fast: false
matrix:
system: [ "GO-spr", "phoenix-Amd-Am5", "GO-i9" ]
# system: [ "GO-spr", "phoenix-Amd-Am5", "GO-i9", "mlc-server" ]
system: [ "mlc-server" ]
python-version: [ "3.12" ]
model: [ "resnet50", "retinanet", "bert-99", "bert-99.9", "gptj-99.9", "3d-unet-99.9", "sdxl" ]
exclude:
- model: gptj-99.9

steps:
- name: Test MLPerf Inference NVIDIA ${{ matrix.model }}
env:
gpu_name: rtx_4090
run: |
# Set hw_name based on matrix.system
if [ "${{ matrix.system }}" = "GO-spr" ]; then
hw_name="RTX4090x2"
gpu_name=rtx_4090
docker_string=" --docker"
elif [ "${{ matrix.system }}" = "mlc-server" ]; then
hw_name="H100x8"
gpu_name=h100
docker_string=" "
else
hw_name="RTX4090x1"
gpu_name=rtx_4090
docker_string=" --docker"
fi

if [ -f "gh_action/bin/deactivate" ]; then source gh_action/bin/deactivate; fi
@@ -40,6 +51,6 @@ jobs:
pip install --upgrade cm4mlops
cm pull repo

- cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --preprocess_submission=yes --pull_changes=yes --pull_inference_changes=yes --execution_mode=valid --gpu_name=rtx_4090 --pull_changes=yes --pull_inference_changes=yes --model=${{ matrix.model }} --submitter="MLCommons" --hw_name=$hw_name --implementation=nvidia --backend=tensorrt --category=datacenter,edge --division=closed --docker_dt=yes --docker_it=no --docker_cm_repo=mlcommons@mlperf-automations --docker_cm_repo_branch=dev --adr.compiler.tags=gcc --device=cuda --use_model_from_host=yes --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean --docker --quiet
+ cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --preprocess_submission=yes --pull_changes=yes --pull_inference_changes=yes --execution_mode=valid --gpu_name=$gpu_name --pull_changes=yes --pull_inference_changes=yes --model=${{ matrix.model }} --submitter="MLCommons" --hw_name=$hw_name --implementation=nvidia --backend=tensorrt --category=datacenter,edge --division=closed --docker_dt=yes --docker_it=no --docker_cm_repo=mlcommons@mlperf-automations --docker_cm_repo_branch=dev --adr.compiler.tags=gcc --device=cuda --use_model_from_host=yes --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean $docker_string --quiet

cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/mlcommons/mlperf_inference_unofficial_submissions_v5.0 --repo_branch=auto-update --commit_message="Results from GH action on NVIDIA_$hw_name" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=$hw_name
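The shell block in this workflow keys everything off matrix.system: mlc-server (H100x8) runs natively, so docker_string stays empty, while the RTX 4090 hosts keep " --docker" and run inside a container. A hedged Python restatement of that mapping follows (system names and values are taken from the diff; the helper itself is illustrative):

# Illustrative mapping of the workflow's if/elif/else block.
SYSTEM_SETTINGS = {
    'GO-spr':     {'hw_name': 'RTX4090x2', 'gpu_name': 'rtx_4090', 'docker': True},
    'mlc-server': {'hw_name': 'H100x8',    'gpu_name': 'h100',     'docker': False},
}
DEFAULT = {'hw_name': 'RTX4090x1', 'gpu_name': 'rtx_4090', 'docker': True}

def settings(system):
    # Unknown systems fall through to the single-RTX4090 defaults, as in the script.
    return SYSTEM_SETTINGS.get(system, DEFAULT)

s = settings('mlc-server')
docker_string = ' --docker' if s['docker'] else ' '  # spliced into the cm command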
7 changes: 6 additions & 1 deletion script/app-mlperf-inference/_cm.yaml
@@ -1793,7 +1793,12 @@ update_meta_if_env:
use_host_group_id: True
use_host_user_id: True
pass_user_group: True #useful if docker is run by a different user from the one who built it and under the same group

+ - enable_if_env:
+ CM_HOST_OS_TYPE:
+ - linux
+ adr:
+ compiler:
+ tags: gcc

docker:
deps:
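The added block extends update_meta_if_env so that, on Linux hosts, an adr entry pins the compiler dependency to gcc. Below is a hedged sketch of the conditional-merge idea behind update_meta_if_env (not CM's actual implementation; the rule and environment values come from the diff):

def update_meta_if_env(meta, rules, env):
    # Merge each rule's extra meta keys only when its enable_if_env condition matches.
    for rule in rules:
        cond = rule.get('enable_if_env', {})
        if all(env.get(key) in allowed for key, allowed in cond.items()):
            for key, value in rule.items():
                if key != 'enable_if_env':
                    meta.setdefault(key, {}).update(value)
    return meta

rules = [{'enable_if_env': {'CM_HOST_OS_TYPE': ['linux']},
          'adr': {'compiler': {'tags': 'gcc'}}}]
meta = update_meta_if_env({}, rules, {'CM_HOST_OS_TYPE': 'linux'})
assert meta == {'adr': {'compiler': {'tags': 'gcc'}}}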
3 changes: 3 additions & 0 deletions script/build-docker-image/_cm.yaml
@@ -38,6 +38,9 @@ input_mapping:
new_env_keys:
- CM_DOCKER_*

+ deps:
+ - tags: get,docker

prehook_deps:
- enable_if_env:
CM_BUILD_DOCKERFILE:
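The new get,docker dependency is what resolves the container CLI before the image is built; the customize.py change below then reads the exported CM_CONTAINER_TOOL. A minimal sketch of what such a detection step might do (an assumption about the dependency's behaviour, not its actual code):

import shutil

def detect_container_tool():
    # Prefer docker, fall back to podman; either one can satisfy CM_CONTAINER_TOOL.
    for tool in ('docker', 'podman'):
        if shutil.which(tool):
            return tool
    raise RuntimeError('neither docker nor podman found on PATH')

env = {'CM_CONTAINER_TOOL': detect_container_tool()}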
3 changes: 2 additions & 1 deletion script/build-docker-image/customize.py
@@ -66,7 +66,8 @@ def preprocess(i):

# Prepare CMD to build image
XCMD = [
- 'docker build ' + env.get('CM_DOCKER_CACHE_ARG', ''),
+ f'{env["CM_CONTAINER_TOOL"]} build ' +
+ env.get('CM_DOCKER_CACHE_ARG', ''),
' ' + build_args,
' -f "' + dockerfile_path + '"',
' -t "' + image_name,
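With CM_CONTAINER_TOOL in the environment, podman build and docker build now share one code path. A hedged illustration of the string this assembles (the build args, paths, and closing quote are simplified assumptions):

env = {'CM_CONTAINER_TOOL': 'podman', 'CM_DOCKER_CACHE_ARG': '--no-cache'}
build_args = '--build-arg CM_VERSION=dev'   # hypothetical
dockerfile_path = '/tmp/Dockerfile'         # hypothetical
image_name = 'local/cm-script-app-mlperf-inference:ubuntu-22.04-latest'

XCMD = [
    f'{env["CM_CONTAINER_TOOL"]} build ' + env.get('CM_DOCKER_CACHE_ARG', ''),
    ' ' + build_args,
    ' -f "' + dockerfile_path + '"',
    ' -t "' + image_name + '"',  # the real script appends the tag and quote later
]
print(''.join(XCMD))
# podman build --no-cache --build-arg CM_VERSION=dev -f "/tmp/Dockerfile" -t "local/..."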
4 changes: 3 additions & 1 deletion script/run-docker-container/_cm.yaml
@@ -58,6 +58,9 @@ input_mapping:
new_env_keys:
- 'CM_DOCKER_CONTAINER_ID'

+ deps:
+ - tags: get,docker

prehook_deps:
- names:
- build-docker-image
@@ -69,4 +72,3 @@ prehook_deps:
CM_DOCKER_CONTAINER_ID:
- on
tags: build,docker,image
- - tags: get,docker
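Moving get,docker from prehook_deps into deps is an ordering fix: in the CM script flow, deps resolve before the script's preprocess() runs, while prehook_deps only run afterwards, too late for the customize.py code below that reads CM_CONTAINER_TOOL. A toy illustration of that ordering (stubbed, not CM's engine):

def resolve(dep, env):
    if dep['tags'] == 'get,docker':
        env['CM_CONTAINER_TOOL'] = 'podman'  # illustrative result of detection

def preprocess(env):
    # run-docker-container/customize.py assembles its commands here
    print('container tool at preprocess:', env.get('CM_CONTAINER_TOOL', 'MISSING'))

meta = {'deps': [{'tags': 'get,docker'}], 'prehook_deps': []}
env = {}
for dep in meta['deps']:
    resolve(dep, env)   # runs first, so CM_CONTAINER_TOOL is set in time
preprocess(env)         # prints 'podman'
for dep in meta['prehook_deps']:
    resolve(dep, env)   # would only run after preprocess(), hence the move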
30 changes: 15 additions & 15 deletions script/run-docker-container/customize.py
@@ -51,7 +51,7 @@ def preprocess(i):
print('')
print('Checking existing Docker container:')
print('')
CMD = f"""docker ps --filter "ancestor={DOCKER_CONTAINER}" """
CMD = f"""{env['CM_CONTAINER_TOOL']} ps --filter "ancestor={DOCKER_CONTAINER}" """
if os_info['platform'] == 'windows':
CMD += " 2> nul"
else:
@@ -78,7 +78,7 @@ def preprocess(i):
if env.get('CM_DOCKER_CONTAINER_ID', '') != '':
del (env['CM_DOCKER_CONTAINER_ID']) # not valid ID

CMD = "docker images -q " + DOCKER_CONTAINER
CMD = f"""{env['CM_CONTAINER_TOOL']} images -q """ + DOCKER_CONTAINER

if os_info['platform'] == 'windows':
CMD += " 2> nul"
@@ -196,11 +191,6 @@ def postprocess(i):
return {'return': 1, 'error': 'Can\'t find separator : in a mount string: {}'.format(
mount_cmd)}

- # mount_parts = mount_cmd.split(":")
- # if len(mount_parts) != 2:
- # return {'return': 1, 'error': 'Invalid mount {}
- # specified'.format(mount_parts)}

host_mount = mount_parts[0]

if not os.path.exists(host_mount):
@@ -240,14 +235,14 @@

existing_container_id = env.get('CM_DOCKER_CONTAINER_ID', '')
if existing_container_id:
CMD = f"ID={existing_container_id} && docker exec $ID bash -c '" + run_cmd + "'"
CMD = f"""ID={existing_container_id} && {env['CM_CONTAINER_TOOL']} exec $ID bash -c '""" + run_cmd + "'"
else:
CONTAINER = f"docker run -dt {run_opts} --rm {docker_image_repo}/{docker_image_name}:{docker_image_tag} bash"
CMD = f"ID=`{CONTAINER}` && docker exec $ID bash -c '{run_cmd}'"
CONTAINER = f"""{env['CM_CONTAINER_TOOL']} run -dt {run_opts} --rm {docker_image_repo}/{docker_image_name}:{docker_image_tag} bash"""
CMD = f"""ID=`{CONTAINER}` && {env['CM_CONTAINER_TOOL']} exec $ID bash -c '{run_cmd}'"""

if False and str(env.get('CM_KEEP_DETACHED_CONTAINER', '')).lower() not in [
'yes', "1", 'true']:
CMD += " && docker kill $ID >/dev/null"
CMD += f""" && {env['CM_CONTAINER_TOOL']} kill $ID >/dev/null"""

CMD += ' && echo "ID=$ID"'

@@ -256,7 +251,7 @@
print('')
print(CMD)
print('')
print("Running " + run_cmd + " inside docker container")
print(
"Running " +
run_cmd +
f""" inside {env['CM_CONTAINER_TOOL']} container""")

record_script({'cmd': CMD, 'env': env})

@@ -280,7 +278,8 @@

docker_out = result.stdout
# if docker_out != 0:
- # return {'return': docker_out, 'error': 'docker run failed'}
+ # return {'return': docker_out, 'error': f""{env['CM_CONTAINER_TOOL']}
+ # run failed""}

lines = docker_out.split("\n")

@@ -304,7 +303,7 @@
x1 = '-it'
x2 = " && bash ) || bash"

CONTAINER = "docker run " + x1 + " --entrypoint " + x + x + " " + run_opts + \
CONTAINER = f"{env['CM_CONTAINER_TOOL']} run " + x1 + " --entrypoint " + x + x + " " + run_opts + \
" " + docker_image_repo + "/" + docker_image_name + ":" + docker_image_tag
CMD = CONTAINER + " bash -c " + x + run_cmd_prefix + run_cmd + x2 + x

@@ -320,7 +319,8 @@
if docker_out != 0:
if docker_out % 256 == 0:
docker_out = 1
- return {'return': docker_out, 'error': 'docker run failed'}
+ return {'return': docker_out,
+ 'error': f"""{env['CM_CONTAINER_TOOL']} run failed"""}

return {'return': 0}

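Taken together, the postprocess changes emit the detached-run-then-exec pattern with whichever CLI CM_CONTAINER_TOOL names. A hedged reconstruction of the resulting command string (variable names mirror the diff; the concrete values are illustrative):

env = {'CM_CONTAINER_TOOL': 'podman'}
run_opts = '-v /home/user/results:/cm-mount/results'            # hypothetical
docker_image_repo = 'localhost/local'                           # hypothetical
docker_image_name = 'cm-script-app-mlperf-inference'            # hypothetical
docker_image_tag = 'ubuntu-22.04-latest'                        # hypothetical
run_cmd = 'cm run script --tags=run-mlperf,inference --quiet'   # hypothetical

CONTAINER = (f"{env['CM_CONTAINER_TOOL']} run -dt {run_opts} --rm "
             f"{docker_image_repo}/{docker_image_name}:{docker_image_tag} bash")
CMD = f"ID=`{CONTAINER}` && {env['CM_CONTAINER_TOOL']} exec $ID bash -c '{run_cmd}'"
CMD += ' && echo "ID=$ID"'
print(CMD)  # starts with: podman run -dt ... --rm ... bash, then podman exec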
1 change: 1 addition & 0 deletions script/run-mlperf-inference-app/_cm.yaml
@@ -34,6 +34,7 @@ default_env:
CM_MLPERF_RUN_STYLE: test
CM_MLPERF_SKIP_SUBMISSION_GENERATION: no
CM_DOCKER_PRIVILEGED_MODE: yes
+ CM_MLPERF_SUBMISSION_DIVISION: open

input_mapping:
api_server: CM_MLPERF_INFERENCE_API_SERVER
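The new CM_MLPERF_SUBMISSION_DIVISION: open default means generated submissions fall into the open division unless a division is passed explicitly, as the Nvidia workflow above does with --division=closed. A sketch of an explicit override through the Python API, assuming cmind is installed and that the 'division' input key mirrors the CLI's --division flag:

import cmind

r = cmind.access({
    'action': 'run',
    'automation': 'script',
    'tags': 'run-mlperf,inference,_submission,_short',
    'division': 'closed',  # override the new open-division default
    'quiet': True,
})
if r['return'] > 0:
    raise RuntimeError(r.get('error', 'cm script run failed'))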