From a7bd3149e552d17c3df55622b4ff350ebebca14f Mon Sep 17 00:00:00 2001
From: Arjun Suresh
Date: Fri, 14 Feb 2025 09:19:42 +0000
Subject: [PATCH 1/5] Update test-nvidia-mlperf-inference-implementations.yml

---
 .../workflows/test-nvidia-mlperf-inference-implementations.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-nvidia-mlperf-inference-implementations.yml b/.github/workflows/test-nvidia-mlperf-inference-implementations.yml
index 0cab69c89..415d697b7 100644
--- a/.github/workflows/test-nvidia-mlperf-inference-implementations.yml
+++ b/.github/workflows/test-nvidia-mlperf-inference-implementations.yml
@@ -58,7 +58,7 @@ jobs:
 
           pip install --upgrade mlcflow
           mlc pull repo mlcommons@mlperf-automations --branch=dev
 
-          mlcr --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r5.0-dev --preprocess_submission=yes --pull_changes=yes --pull_inference_changes=yes --execution_mode=valid --gpu_name=$gpu_name --pull_changes=yes --pull_inference_changes=yes --model=${{ matrix.model }} --submitter="GATEOverflow " --hw_name=$hw_name --implementation=nvidia --backend=tensorrt --category=$category --division=closed --docker_dt --docker_mlc_repo=mlcommons@mlperf-automations --docker_mlc_repo_branch=dev --adr.compiler.tags=gcc --device=cuda --use_model_from_host=yes --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean $docker_string $submission_preprocessor_args --quiet
+          mlcr --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r5.0-dev --preprocess_submission=yes --pull_changes=yes --pull_inference_changes=yes --execution_mode=valid --gpu_name=$gpu_name --pull_changes=yes --pull_inference_changes=yes --model=${{ matrix.model }} --submitter="GATEOverflow" --hw_name=$hw_name --implementation=nvidia --backend=tensorrt --category=$category --division=closed --docker_dt --docker_mlc_repo=mlcommons@mlperf-automations --docker_mlc_repo_branch=dev --adr.compiler.tags=gcc --device=cuda --use_model_from_host=yes --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean $docker_string $submission_preprocessor_args --quiet
           #mlcr --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/mlcommons/mlperf_inference_unofficial_submissions_v5.0 --repo_branch=auto-update --commit_message="Results from GH action on NVIDIA_$hw_name" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=$hw_name
           mlcr --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/GATEOverflow/mlperf_inference_submissions_v5.0 --repo_branch=main --commit_message="Results from GH actions on NVIDIA_$hw_name" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=$hw_name

From 968f1a67f9a57254b6cef9a30f410644d462f261 Mon Sep 17 00:00:00 2001
From: Arjun Suresh
Date: Fri, 14 Feb 2025 14:53:25 +0530
Subject: [PATCH 2/5] Strip mlperf inference submitter name

---
 script/generate-mlperf-inference-submission/customize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/script/generate-mlperf-inference-submission/customize.py b/script/generate-mlperf-inference-submission/customize.py
index 0d1ac5a69..9bbe9b2d0 100644
--- a/script/generate-mlperf-inference-submission/customize.py
+++ b/script/generate-mlperf-inference-submission/customize.py
@@ -150,7 +150,7 @@ def generate_submission(env, state, inp, submission_division):
 
     # Check submitter
     if env.get('MLC_MLPERF_SUBMITTER'):
-        submitter = env['MLC_MLPERF_SUBMITTER']
+        submitter = env['MLC_MLPERF_SUBMITTER'].strip()
         system_meta_tmp['submitter'] = submitter
     else:
         submitter = system_meta_default['submitter']
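Note: patches 1 and 2 fix the same defect from both ends. Patch 1 drops the stray trailing space the workflow passed in --submitter="GATEOverflow ", while patch 2 makes generate_submission() tolerant of such input by stripping the value before it reaches the submission metadata. A minimal sketch of the normalization (the env dict here is a hypothetical stand-in for the real mlcflow environment):

    # Hypothetical env for illustration; in customize.py the value arrives
    # via the mlcflow environment, not a literal dict.
    env = {'MLC_MLPERF_SUBMITTER': 'GATEOverflow '}   # note the trailing space

    submitter = env['MLC_MLPERF_SUBMITTER'].strip()   # the normalization patch 2 adds
    assert submitter == 'GATEOverflow'                # whitespace no longer leaks into metadata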
From 403724930876a6d5cbf1808b1c72137c6ddfb777 Mon Sep 17 00:00:00 2001
From: Arjun Suresh
Date: Fri, 14 Feb 2025 11:20:39 +0000
Subject: [PATCH 3/5] Update test-nvidia-mlperf-inference-implementations.yml

---
 .../workflows/test-nvidia-mlperf-inference-implementations.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-nvidia-mlperf-inference-implementations.yml b/.github/workflows/test-nvidia-mlperf-inference-implementations.yml
index 415d697b7..2fac2632c 100644
--- a/.github/workflows/test-nvidia-mlperf-inference-implementations.yml
+++ b/.github/workflows/test-nvidia-mlperf-inference-implementations.yml
@@ -2,7 +2,7 @@ name: MLPerf Inference Nvidia implementations
 
 on:
   schedule:
-    - cron: "05 01 * * *"
+    - cron: "27 11 * * *"
 
 jobs:
   run_nvidia:

From 124c3dd7ca024977b100cae21e4b9e090184a32b Mon Sep 17 00:00:00 2001
From: Arjun Suresh
Date: Fri, 14 Feb 2025 18:32:44 +0530
Subject: [PATCH 4/5] Fix llama2 mlperf variants

---
 script/get-ml-model-llama2/meta.yaml     | 29 ++++++++++++++++++++++++++---
 script/get-ml-model-llama2/run-rclone.sh |  5 ++++-
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/script/get-ml-model-llama2/meta.yaml b/script/get-ml-model-llama2/meta.yaml
index 0b0819678..a816ab604 100644
--- a/script/get-ml-model-llama2/meta.yaml
+++ b/script/get-ml-model-llama2/meta.yaml
@@ -93,23 +93,46 @@ variations:
     group: download-source
     env:
       MLC_DOWNLOAD_SRC: huggingface
+  70b:
+    env:
+      MLC_GIT_CHECKOUT_FOLDER: Llama-2-70b-chat-hf
+    group: model-size
+    default: true
+    default_variations:
+      huggingface-stub: meta-llama/Llama-2-70b-chat-hf
+  7b:
+    env:
+      MLC_GIT_CHECKOUT_FOLDER: Llama-2-7b-chat-hf
+    group: model-size
+    default_variations:
+      huggingface-stub: meta-llama/Llama-2-7b-chat-hf
+
+  70b-fused-qkv:
+    env:
+      MLC_GIT_CHECKOUT_FOLDER: Llama-2-70b-fused-qkv-mlperf
+    group: model-size
+
   meta-llama/Llama-2-70b-chat-hf:
+    base:
+    - 70b
     adr:
       hf-zoo:
         tags: _model-stub.meta-llama/Llama-2-70b-chat-hf
-    default: true
     env:
-      MLC_GIT_CHECKOUT_FOLDER: Llama-2-70b-chat-hf
       MLC_MODEL_ZOO_ENV_KEY: LLAMA2
     group: huggingface-stub
   meta-llama/Llama-2-7b-chat-hf:
+    base:
+    - 7b
     adr:
       hf-zoo:
         tags: _model-stub.meta-llama/Llama-2-7b-chat-hf
     env:
-      MLC_GIT_CHECKOUT_FOLDER: Llama-2-7b-chat-hf
       MLC_MODEL_ZOO_ENV_KEY: LLAMA2
     group: huggingface-stub
+
+
+
   nvidia:
     default_variations:
       framework: pytorch
diff --git a/script/get-ml-model-llama2/run-rclone.sh b/script/get-ml-model-llama2/run-rclone.sh
index 0d56e5eeb..7daee2da8 100644
--- a/script/get-ml-model-llama2/run-rclone.sh
+++ b/script/get-ml-model-llama2/run-rclone.sh
@@ -1,3 +1,6 @@
 rclone config create mlc-llama2 drive config_is_local=false scope=drive.readonly root_folder_id=11tBZvvrh0FCm3XuR5E849K42TqftYdUF
 rclone config reconnect mlc-llama2:
-rclone sync mlc-llama2:${MLC_GIT_CHECKOUT_FOLDER} ${LLAMA2_CHECKPOINT_PATH}/${MLC_GIT_CHECKOUT_FOLDER} -P
+cmd="rclone sync mlc-llama2:${MLC_GIT_CHECKOUT_FOLDER} ${LLAMA2_CHECKPOINT_PATH}/${MLC_GIT_CHECKOUT_FOLDER} -P"
+echo $cmd
+eval $cmd
+test $? -eq 0 || exit $?
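Note: the run-rclone.sh change in patch 4 swaps a bare rclone invocation for an echo/eval pair with an explicit exit-status check, so the fully expanded command is logged before it runs and a failed sync aborts the script instead of passing silently. A standalone sketch of the same pattern, with hypothetical values standing in for MLC_GIT_CHECKOUT_FOLDER and LLAMA2_CHECKPOINT_PATH:

    # Hypothetical values; the real script takes both from the MLC environment.
    MLC_GIT_CHECKOUT_FOLDER=Llama-2-7b-chat-hf
    LLAMA2_CHECKPOINT_PATH=$HOME/checkpoints

    cmd="rclone sync mlc-llama2:${MLC_GIT_CHECKOUT_FOLDER} ${LLAMA2_CHECKPOINT_PATH}/${MLC_GIT_CHECKOUT_FOLDER} -P"
    echo $cmd                  # log the expanded command before running it
    eval $cmd                  # run it
    test $? -eq 0 || exit $?   # propagate rclone's exit code on failure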
From 19ab479626b19cc8849f5323ff47f7726c73c6fd Mon Sep 17 00:00:00 2001
From: Arjun Suresh
Date: Fri, 14 Feb 2025 22:42:34 +0530
Subject: [PATCH 5/5] Support llama3.1 mlperf download

---
 .gitignore                               |  8 ++++++--
 script/get-ml-model-llama2/meta.yaml     |  3 +++
 script/get-ml-model-llama2/run-rclone.sh |  2 --
 script/get-ml-model-llama3/customize.py  |  6 ++++++
 script/get-ml-model-llama3/meta.yaml     | 35 ++++++++++++++++++++++++++++++++++++---
 script/get-ml-model-llama3/run-rclone.sh |  4 ++++
 script/get-rclone-config/customize.py    |  8 ++++++-
 script/get-rclone-config/meta.yaml       |  8 ++++++++
 8 files changed, 66 insertions(+), 8 deletions(-)
 create mode 100644 script/get-ml-model-llama3/run-rclone.sh

diff --git a/.gitignore b/.gitignore
index 96523fae4..29951b3fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,10 @@ wheels/
 .coverage
 htmlcov
 *tmp/
-*tmp-ck-*/
+tmp-*
 local/cache/
-
+mlc-log.txt
+repos.json
+index_script.json
+index_cache.json
+index_experiment.json
diff --git a/script/get-ml-model-llama2/meta.yaml b/script/get-ml-model-llama2/meta.yaml
index a816ab604..ed7477c04 100644
--- a/script/get-ml-model-llama2/meta.yaml
+++ b/script/get-ml-model-llama2/meta.yaml
@@ -89,6 +89,9 @@ variations:
     default: true
     env:
       MLC_DOWNLOAD_SRC: mlcommons
+    prehook_deps:
+    - tags: get,rclone-config,_mlperf-llama2
+      force_cache: yes
   hf:
     group: download-source
     env:
       MLC_DOWNLOAD_SRC: huggingface
diff --git a/script/get-ml-model-llama2/run-rclone.sh b/script/get-ml-model-llama2/run-rclone.sh
index 7daee2da8..1fc602a9f 100644
--- a/script/get-ml-model-llama2/run-rclone.sh
+++ b/script/get-ml-model-llama2/run-rclone.sh
@@ -1,5 +1,3 @@
-rclone config create mlc-llama2 drive config_is_local=false scope=drive.readonly root_folder_id=11tBZvvrh0FCm3XuR5E849K42TqftYdUF
-rclone config reconnect mlc-llama2:
 cmd="rclone sync mlc-llama2:${MLC_GIT_CHECKOUT_FOLDER} ${LLAMA2_CHECKPOINT_PATH}/${MLC_GIT_CHECKOUT_FOLDER} -P"
 echo $cmd
 eval $cmd
diff --git a/script/get-ml-model-llama3/customize.py b/script/get-ml-model-llama3/customize.py
index 2429a1e92..17ebedabd 100644
--- a/script/get-ml-model-llama3/customize.py
+++ b/script/get-ml-model-llama3/customize.py
@@ -20,6 +20,12 @@ def preprocess(i):
         env['MLC_GIT_CHECKOUT_FOLDER'] = os.path.join(
             path, env['MLC_ML_MODEL_NAME'])
 
+    if env['MLC_DOWNLOAD_SRC'] == "mlcommons":
+        i['run_script_input']['script_name'] = 'run-rclone'
+        if env.get('MLC_OUTDIRNAME', '') != '':
+            env['LLAMA3_CHECKPOINT_PATH'] = env['MLC_OUTDIRNAME']
+        else:
+            env['LLAMA3_CHECKPOINT_PATH'] = os.getcwd()
     env['MLC_TMP_REQUIRE_DOWNLOAD'] = 'yes'
 
     return {'return': 0}
diff --git a/script/get-ml-model-llama3/meta.yaml b/script/get-ml-model-llama3/meta.yaml
index f5432f3ee..673f34c49 100644
--- a/script/get-ml-model-llama3/meta.yaml
+++ b/script/get-ml-model-llama3/meta.yaml
@@ -12,6 +12,8 @@ prehook_deps:
 - enable_if_env:
     MLC_TMP_REQUIRE_DOWNLOAD:
     - 'yes'
+    MLC_DOWNLOAD_SRC:
+    - huggingface
   env: {}
   extra_cache_tags: llama3,llama-3
   force_env_keys:
@@ -37,21 +39,48 @@ variations:
       MLC_ML_MODEL_PRECISION: fp16
       MLC_ML_MODEL_WEIGHT_DATA_TYPES: fp16
     group: precision
+  405b:
+    group: model-size
+    default: true
+    env:
+      MLC_ML_MODEL_NAME: Llama-3.1-405B-Instruct
+  8b:
+    group: model-size
+    env:
+      MLC_ML_MODEL_NAME: Llama-3.1-8b-Instruct
+  mlc:
+    group: download-src
+    default: true
+    prehook_deps:
+    - tags: get,rclone-config,_mlperf-llama3-1
+      force_cache: true
+    env:
+      MLC_DOWNLOAD_SRC: mlcommons
+  hf:
+    group: download-src
+    default_variations:
+      huggingface-stub: meta-llama/Llama-3.1-405B-Instruct
+    env:
+      MLC_DOWNLOAD_SRC: huggingface
+
   meta-llama/Llama-3.1-405B-Instruct:
+    base:
+    - 405b
     adr:
       hf-zoo:
         tags: _model-stub.meta-llama/Llama-3.1-405B-Instruct
-    default: true
     env:
-      MLC_ML_MODEL_NAME: Llama-3-405b-instruct
+      MLC_ML_MODEL_NAME: Llama-3.1-405B-Instruct
       MLC_MODEL_ZOO_ENV_KEY: LLAMA3
     group: huggingface-stub
   meta-llama/Llama-3.1-8B-Instruct:
+    base:
+    - 8b
     adr:
       hf-zoo:
         tags: _model-stub.meta-llama/Llama-3.1-8B-Instruct
     env:
-      MLC_ML_MODEL_NAME: Llama-3-8b-instruct
+      MLC_ML_MODEL_NAME: Llama-3.1-8b-Instruct
       MLC_MODEL_ZOO_ENV_KEY: LLAMA3
     group: huggingface-stub
   vllm:
diff --git a/script/get-ml-model-llama3/run-rclone.sh b/script/get-ml-model-llama3/run-rclone.sh
new file mode 100644
index 000000000..e1943a00f
--- /dev/null
+++ b/script/get-ml-model-llama3/run-rclone.sh
@@ -0,0 +1,4 @@
+cmd="rclone sync mlc-llama3-1:inference/${MLC_ML_MODEL_NAME} ${LLAMA3_CHECKPOINT_PATH}/${MLC_ML_MODEL_NAME} -P"
+echo $cmd
+eval $cmd
+test $? -eq 0 || exit $?
diff --git a/script/get-rclone-config/customize.py b/script/get-rclone-config/customize.py
index f90b972bc..719bf7856 100644
--- a/script/get-rclone-config/customize.py
+++ b/script/get-rclone-config/customize.py
@@ -14,8 +14,14 @@ def preprocess(i):
 
     quiet = (env.get('MLC_QUIET', False) == 'yes')
 
+    run_cmds = []
     if env.get('MLC_RCLONE_CONFIG_CMD', '') != '':
-        env['MLC_RUN_CMD'] = env['MLC_RCLONE_CONFIG_CMD']
+        run_cmds.append(env['MLC_RCLONE_CONFIG_CMD'])
+
+    if env.get('MLC_RCLONE_CONNECT_CMD', '') != '':
+        run_cmds.append(env['MLC_RCLONE_CONNECT_CMD'])
+
+    env['MLC_RUN_CMD'] = ' && '.join(run_cmds)
 
     return {'return': 0}
 
diff --git a/script/get-rclone-config/meta.yaml b/script/get-rclone-config/meta.yaml
index 8ebbe168a..8cc949d25 100644
--- a/script/get-rclone-config/meta.yaml
+++ b/script/get-rclone-config/meta.yaml
@@ -11,3 +11,11 @@ variations:
   mlc-inference:
     env:
       MLC_RCLONE_CONFIG_CMD: 'rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com'
+  mlperf-llama2:
+    env:
+      MLC_RCLONE_CONFIG_CMD: 'rclone config create mlc-llama2 drive config_is_local=false scope=drive.readonly root_folder_id=11tBZvvrh0FCm3XuR5E849K42TqftYdUF'
+      MLC_RCLONE_CONNECT_CMD: 'rclone config reconnect mlc-llama2:'
+  mlperf-llama3-1:
+    env:
+      MLC_RCLONE_CONFIG_CMD: 'rclone config create mlc-llama3-1 drive config_is_local=false scope=drive.readonly root_folder_id=12K-2yvmr1ZSZ7SLrhidCbWc0BriN98am'
+      MLC_RCLONE_CONNECT_CMD: 'rclone config reconnect mlc-llama3-1:'
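Note: after patch 5, get-rclone-config builds MLC_RUN_CMD by joining MLC_RCLONE_CONFIG_CMD and MLC_RCLONE_CONNECT_CMD with ' && ', so credential setup and the reconnect run as a single shell command. As a worked example derived from the _mlperf-llama3-1 entries above (not captured output), the composed command would be:

    rclone config create mlc-llama3-1 drive config_is_local=false scope=drive.readonly root_folder_id=12K-2yvmr1ZSZ7SLrhidCbWc0BriN98am && rclone config reconnect mlc-llama3-1:

The model fetch itself then happens in the new get-ml-model-llama3/run-rclone.sh, which syncs mlc-llama3-1:inference/${MLC_ML_MODEL_NAME} into the chosen LLAMA3_CHECKPOINT_PATH.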