diff --git a/.github/workflows/test-mlperf-inference-mixtral.yml b/.github/workflows/test-mlperf-inference-mixtral.yml
index 04a944fd4..6687ff048 100644
--- a/.github/workflows/test-mlperf-inference-mixtral.yml
+++ b/.github/workflows/test-mlperf-inference-mixtral.yml
@@ -31,5 +31,5 @@ jobs:
         git config --global credential.helper store
         huggingface-cli login --token ${{ secrets.HF_TOKEN }} --add-to-git-credential
         cm pull repo
-        cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --pull_changes=yes --pull_inference_changes=yes --model=mixtral-8x7b --implementation=reference --batch_size=1 --precision=${{ matrix.precision }} --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker_it=no --docker_cm_repo=gateoverflow@mlperf-automations --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --docker --quiet --test_query_count=3 --target_qps=0.001 --clean --env.CM_MLPERF_MODEL_MIXTRAL_8X7B_DOWNLOAD_TO_HOST=yes --env.CM_MLPERF_DATASET_MIXTRAL_8X7B_DOWNLOAD_TO_HOST=yes --adr.openorca-mbxp-gsm8k-combined-preprocessed.tags=_size.1
+        cm run script --tags=run-mlperf,inference,_submission,_short --adr.inference-src.tags=_branch.dev --submitter="MLCommons" --pull_changes=yes --pull_inference_changes=yes --model=mixtral-8x7b --implementation=reference --batch_size=1 --precision=${{ matrix.precision }} --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker_it=no --docker_cm_repo=gateoverflow@mlperf-automations --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --docker --quiet --test_query_count=3 --target_qps=0.001 --clean --env.CM_MLPERF_MODEL_MIXTRAL_8X7B_DOWNLOAD_TO_HOST=yes --env.CM_MLPERF_DATASET_MIXTRAL_8X7B_DOWNLOAD_TO_HOST=yes --adr.openorca-mbxp-gsm8k-combined-preprocessed.tags=_size.1
         cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/mlcommons/mlperf_inference_test_submissions_v5.0 --repo_branch=dev --commit_message="Results from self hosted Github actions - GO-phoenix" --quiet --submission_dir=$HOME/gh_action_submissions
diff --git a/.github/workflows/test-mlperf-inference-rgat.yml b/.github/workflows/test-mlperf-inference-rgat.yml
index 94a2e174b..62d5a5139 100644
--- a/.github/workflows/test-mlperf-inference-rgat.yml
+++ b/.github/workflows/test-mlperf-inference-rgat.yml
@@ -31,7 +31,7 @@ jobs:
         cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }}
     - name: Test MLPerf Inference R-GAT using ${{ matrix.backend }} on ${{ matrix.os }}
       run: |
-        cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --pull_changes=yes --pull_inference_changes=yes --submitter="MLCommons" --hw_name=gh_${{ matrix.os }}_x86 --model=rgat --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --category=datacenter --quiet -v --target_qps=1
+        cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --adr.inference-src.tags=_branch.dev --pull_changes=yes --pull_inference_changes=yes --submitter="MLCommons" --hw_name=gh_${{ matrix.os }}_x86 --model=rgat --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --category=datacenter --quiet -v --target_qps=1
     - name: Push Results
       if: github.repository_owner == 'gateoverflow'
      env:
diff --git a/.github/workflows/test-nvidia-mlperf-inference-implementations.yml b/.github/workflows/test-nvidia-mlperf-inference-implementations.yml
index 2bbccae6b..bb83bd49a 100644
--- a/.github/workflows/test-nvidia-mlperf-inference-implementations.yml
+++ b/.github/workflows/test-nvidia-mlperf-inference-implementations.yml
@@ -2,7 +2,7 @@ name: MLPerf Inference Nvidia implementations

 on:
   schedule:
-    - cron: "08 01 * * */3" #to be adjusted
+    - cron: "58 10 * * *" #to be adjusted

 jobs:
   run_nvidia:
@@ -17,7 +17,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        system: [ "GO-spr", "phoenix-Amd-Am5", "GO-i9" ]
+        # system: [ "GO-spr", "phoenix-Amd-Am5", "GO-i9", "mlc-server" ]
+        system: [ "mlc-server" ]
         python-version: [ "3.12" ]
         model: [ "resnet50", "retinanet", "bert-99", "bert-99.9", "gptj-99.9", "3d-unet-99.9", "sdxl" ]
         exclude:
@@ -25,12 +26,22 @@ jobs:

     steps:
       - name: Test MLPerf Inference NVIDIA ${{ matrix.model }}
+        env:
+          gpu_name: rtx_4090
         run: |
           # Set hw_name based on matrix.system
           if [ "${{ matrix.system }}" = "GO-spr" ]; then
             hw_name="RTX4090x2"
+            gpu_name=rtx_4090
+            docker_string=" --docker"
+          elif [ "${{ matrix.system }}" = "mlc-server" ]; then
+            hw_name="H100x8"
+            gpu_name=h100
+            docker_string=" "
           else
             hw_name="RTX4090x1"
+            gpu_name=rtx_4090
+            docker_string=" --docker"
           fi

           if [ -f "gh_action/bin/deactivate" ]; then source gh_action/bin/deactivate; fi
@@ -40,6 +51,6 @@ jobs:
           pip install --upgrade cm4mlops
           cm pull repo
-          cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --preprocess_submission=yes --pull_changes=yes --pull_inference_changes=yes --execution_mode=valid --gpu_name=rtx_4090 --pull_changes=yes --pull_inference_changes=yes --model=${{ matrix.model }} --submitter="MLCommons" --hw_name=$hw_name --implementation=nvidia --backend=tensorrt --category=datacenter,edge --division=closed --docker_dt=yes --docker_it=no --docker_cm_repo=mlcommons@mlperf-automations --docker_cm_repo_branch=dev --adr.compiler.tags=gcc --device=cuda --use_model_from_host=yes --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean --docker --quiet
+          cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --preprocess_submission=yes --pull_changes=yes --pull_inference_changes=yes --execution_mode=valid --gpu_name=$gpu_name --pull_changes=yes --pull_inference_changes=yes --model=${{ matrix.model }} --submitter="MLCommons" --hw_name=$hw_name --implementation=nvidia --backend=tensorrt --category=datacenter,edge --division=closed --docker_dt=yes --docker_it=no --docker_cm_repo=mlcommons@mlperf-automations --docker_cm_repo_branch=dev --adr.compiler.tags=gcc --device=cuda --use_model_from_host=yes --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean $docker_string --quiet
           cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/mlcommons/mlperf_inference_unofficial_submissions_v5.0 --repo_branch=auto-update --commit_message="Results from GH action on NVIDIA_$hw_name" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=$hw_name
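The mixtral and R-GAT workflows above both add --adr.inference-src.tags=_branch.dev, which pins the MLCommons inference sources to the dev branch. As a rough mental model of what an --adr.<name>.tags override does (a simplified illustration, not CM's actual resolver), it appends the extra variation tags to the named dependency before that dependency is resolved:

# Simplified illustration of --adr.<name>.tags (not CM's real resolver):
# the override appends extra tags to every dependency registered under
# the given name before that dependency is resolved.
def apply_adr(deps, adr):
    for dep in deps:
        for name in dep.get('names', []):
            extra = adr.get(name, {}).get('tags', '')
            if extra:
                dep['tags'] += ',' + extra
    return deps


deps = [{'names': ['inference-src'], 'tags': 'get,mlcommons,inference,src'}]
print(apply_adr(deps, {'inference-src': {'tags': '_branch.dev'}}))
# -> tags become 'get,mlcommons,inference,src,_branch.dev', so the dev
#    branch of the inference repository is checked out.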
diff --git a/script/app-mlperf-inference/_cm.yaml b/script/app-mlperf-inference/_cm.yaml
index 180bad643..6c939a225 100644
--- a/script/app-mlperf-inference/_cm.yaml
+++ b/script/app-mlperf-inference/_cm.yaml
@@ -1793,7 +1793,12 @@ update_meta_if_env:
       use_host_group_id: True
       use_host_user_id: True
       pass_user_group: True #useful if docker is run by a different user from the one who built it and under the same group
-
+  - enable_if_env:
+      CM_HOST_OS_TYPE:
+        - linux
+    adr:
+      compiler:
+        tags: gcc

 docker:
   deps:
diff --git a/script/build-docker-image/_cm.yaml b/script/build-docker-image/_cm.yaml
index a9dc8cb67..8fd7c2571 100644
--- a/script/build-docker-image/_cm.yaml
+++ b/script/build-docker-image/_cm.yaml
@@ -38,6 +38,9 @@ input_mapping:
 new_env_keys:
   - CM_DOCKER_*

+deps:
+  - tags: get,docker
+
 prehook_deps:
   - enable_if_env:
       CM_BUILD_DOCKERFILE:
diff --git a/script/build-docker-image/customize.py b/script/build-docker-image/customize.py
index b8956de6c..6d971f804 100644
--- a/script/build-docker-image/customize.py
+++ b/script/build-docker-image/customize.py
@@ -66,7 +66,8 @@ def preprocess(i):

     # Prepare CMD to build image
     XCMD = [
-        'docker build ' + env.get('CM_DOCKER_CACHE_ARG', ''),
+        f'{env["CM_CONTAINER_TOOL"]} build ' +
+        env.get('CM_DOCKER_CACHE_ARG', ''),
         ' ' + build_args,
         ' -f "' + dockerfile_path + '"',
         ' -t "' + image_name,
diff --git a/script/run-docker-container/_cm.yaml b/script/run-docker-container/_cm.yaml
index 253a48c91..78f8397d2 100644
--- a/script/run-docker-container/_cm.yaml
+++ b/script/run-docker-container/_cm.yaml
@@ -58,6 +58,9 @@ input_mapping:
 new_env_keys:
   - 'CM_DOCKER_CONTAINER_ID'

+deps:
+  - tags: get,docker
+
 prehook_deps:
   - names:
       - build-docker-image
@@ -69,4 +72,3 @@ prehook_deps:
         CM_DOCKER_CONTAINER_ID:
           - on
       tags: build,docker,image
-  - tags: get,docker
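Note the direction of the get,docker move in both _cm.yaml files above: it leaves prehook_deps and becomes a plain dependency. In CM's script flow, deps are resolved before a script's preprocess() runs, while prehook_deps run after it, and the customize.py changes below read CM_CONTAINER_TOOL inside preprocess(). A minimal sketch of that ordering, assuming this simplified three-phase model rather than the real CM engine:

# Minimal sketch of the phase ordering (simplified; not the real CM engine):
# deps -> preprocess -> prehook_deps -> native run.
def run_script(deps, prehook_deps, preprocess, env):
    for dep in deps:
        dep(env)        # get,docker runs here and can set CM_CONTAINER_TOOL
    preprocess(env)     # safe to read env['CM_CONTAINER_TOOL'] now
    for dep in prehook_deps:
        dep(env)        # anything set here arrives too late for preprocess()


def get_docker(env):
    env['CM_CONTAINER_TOOL'] = 'podman'  # example detected value


run_script([get_docker], [], lambda env: print(env['CM_CONTAINER_TOOL']), {})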
f"""ID=`{CONTAINER}` && {env['CM_CONTAINER_TOOL']} exec $ID bash -c '{run_cmd}'""" if False and str(env.get('CM_KEEP_DETACHED_CONTAINER', '')).lower() not in [ 'yes', "1", 'true']: - CMD += " && docker kill $ID >/dev/null" + CMD += f""" && {env['CM_CONTAINER_TOOL']} kill $ID >/dev/null""" CMD += ' && echo "ID=$ID"' @@ -256,7 +251,10 @@ def postprocess(i): print('') print(CMD) print('') - print("Running " + run_cmd + " inside docker container") + print( + "Running " + + run_cmd + + f""" inside {env['CM_CONTAINER_TOOL']} container""") record_script({'cmd': CMD, 'env': env}) @@ -280,7 +278,8 @@ def postprocess(i): docker_out = result.stdout # if docker_out != 0: - # return {'return': docker_out, 'error': 'docker run failed'} + # return {'return': docker_out, 'error': f""{env['CM_CONTAINER_TOOL']} + # run failed""} lines = docker_out.split("\n") @@ -304,7 +303,7 @@ def postprocess(i): x1 = '-it' x2 = " && bash ) || bash" - CONTAINER = "docker run " + x1 + " --entrypoint " + x + x + " " + run_opts + \ + CONTAINER = f"{env['CM_CONTAINER_TOOL']} run " + x1 + " --entrypoint " + x + x + " " + run_opts + \ " " + docker_image_repo + "/" + docker_image_name + ":" + docker_image_tag CMD = CONTAINER + " bash -c " + x + run_cmd_prefix + run_cmd + x2 + x @@ -320,7 +319,8 @@ def postprocess(i): if docker_out != 0: if docker_out % 256 == 0: docker_out = 1 - return {'return': docker_out, 'error': 'docker run failed'} + return {'return': docker_out, + 'error': f"""{env['CM_CONTAINER_TOOL']} run failed"""} return {'return': 0} diff --git a/script/run-mlperf-inference-app/_cm.yaml b/script/run-mlperf-inference-app/_cm.yaml index a2e3688e7..ea2185060 100644 --- a/script/run-mlperf-inference-app/_cm.yaml +++ b/script/run-mlperf-inference-app/_cm.yaml @@ -34,6 +34,7 @@ default_env: CM_MLPERF_RUN_STYLE: test CM_MLPERF_SKIP_SUBMISSION_GENERATION: no CM_DOCKER_PRIVILEGED_MODE: yes + CM_MLPERF_SUBMISSION_DIVISION: open input_mapping: api_server: CM_MLPERF_INFERENCE_API_SERVER