From a91f4c49bbbf6ce5a11ddc1c93f88df2d57ce503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Fri, 9 May 2025 11:59:27 +0200 Subject: [PATCH 01/10] mistralai/Mixtral-8x7B-Instruct-v0.1: add rocm accuracy server override --- mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml diff --git a/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml b/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml new file mode 100644 index 0000000..bfc5627 --- /dev/null +++ b/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml @@ -0,0 +1,4 @@ +# https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 +model: 'mistralai/Mixtral-8x7B-Instruct-v0.1' +trust-remote-code: true +max-model-len: 16384 From 0640959f9e298c18393ac1ca27c29f8c694ea23e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Fri, 9 May 2025 12:09:37 +0200 Subject: [PATCH 02/10] Llama-3.1-8B-Instruct add accuracy/server-rocm. 
--- meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml diff --git a/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml b/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml new file mode 100644 index 0000000..d06656f --- /dev/null +++ b/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml @@ -0,0 +1,4 @@ +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +gpu_memory_utilization: 0.8 From 049f3b3d62e8894ec2db9776d6b8eb150e3fd7f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Fri, 9 May 2025 15:45:36 +0200 Subject: [PATCH 03/10] meta-llama/Llama-3.1-8B-Instruct: decrease gpu_memory_utilization to prevent OOM (ROCm) --- meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml b/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml index d06656f..c57ac61 100644 --- a/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml +++ b/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml @@ -1,4 +1,4 @@ trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 -gpu_memory_utilization: 0.8 +gpu_memory_utilization: 0.6 From ba9cac20a4d2caa2e9123e64a16889aa2449e45b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Mon, 12 May 2025 12:04:08 +0200 Subject: [PATCH 04/10] mistralai/Mistral-Small-3.1-24B-Instruct-2503: add rocm server override --- .../Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml new file mode 100644 index 0000000..30d16e4 
--- /dev/null +++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml @@ -0,0 +1 @@ +gpu_memory_utilization: 0.8 From 08894ea5da39c27635880698d964923d177855d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Mon, 12 May 2025 12:50:19 +0200 Subject: [PATCH 05/10] add rocm accuracy overrides for Mistral 24b and Phi-4 --- .../accuracy/server-rocm.yml | 1 + .../accuracy/server-rocm.yml | 1 + RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml | 1 + .../Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml | 1 + 4 files changed, 4 insertions(+) create mode 100644 RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml create mode 100644 RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml create mode 100644 RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml create mode 100644 mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml diff --git a/RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml b/RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml new file mode 100644 index 0000000..30d16e4 --- /dev/null +++ b/RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml @@ -0,0 +1 @@ +gpu_memory_utilization: 0.8 diff --git a/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml b/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml new file mode 100644 index 0000000..30d16e4 --- /dev/null +++ b/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml @@ -0,0 +1 @@ +gpu_memory_utilization: 0.8 diff --git a/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml new file mode 100644 index 0000000..30d16e4 --- /dev/null +++ b/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml @@ -0,0 +1 @@ +gpu_memory_utilization: 0.8 diff --git 
a/mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml b/mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml new file mode 100644 index 0000000..30d16e4 --- /dev/null +++ b/mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml @@ -0,0 +1 @@ +gpu_memory_utilization: 0.8 From 736fd47691595bdb812d86e1e4f65f936194f441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Mon, 12 May 2025 12:56:30 +0200 Subject: [PATCH 06/10] reduce ROCm gpu_memory_utilization --- Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml | 1 + .../accuracy/server-rocm.yml | 1 + .../accuracy/server-rocm.yml | 1 + .../accuracy/server-rocm.yml | 1 + .../Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml | 1 + RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml | 2 +- ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml | 1 + .../Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml | 2 +- .../accuracy/server-rocm.yml | 2 +- 9 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml create mode 100644 RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml create mode 100644 RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml create mode 100644 RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml create mode 100644 RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml create mode 100644 ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml diff --git a/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml b/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml new file mode 100644 index 0000000..7d008a6 --- /dev/null +++ b/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml @@ -0,0 +1 @@ +gpu_memory_utilization: 0.6 diff --git a/RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml 
new file mode 100644 index 0000000..7d008a6 --- /dev/null +++ b/RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml @@ -0,0 +1 @@ +gpu_memory_utilization: 0.6 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml new file mode 100644 index 0000000..7d008a6 --- /dev/null +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml @@ -0,0 +1 @@ +gpu_memory_utilization: 0.6 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml new file mode 100644 index 0000000..7d008a6 --- /dev/null +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml @@ -0,0 +1 @@ +gpu_memory_utilization: 0.6 diff --git a/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml new file mode 100644 index 0000000..7d008a6 --- /dev/null +++ b/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml @@ -0,0 +1 @@ +gpu_memory_utilization: 0.6 diff --git a/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml index 30d16e4..7d008a6 100644 --- a/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml +++ b/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml @@ -1 +1 @@ -gpu_memory_utilization: 0.8 +gpu_memory_utilization: 0.6 diff --git a/ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml b/ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml new file mode 100644 index 0000000..7d008a6 --- /dev/null +++ b/ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml @@ -0,0 +1 @@ +gpu_memory_utilization: 0.6 diff --git a/mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml 
b/mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml index 30d16e4..7d008a6 100644 --- a/mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml +++ b/mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml @@ -1 +1 @@ -gpu_memory_utilization: 0.8 +gpu_memory_utilization: 0.6 diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml index 30d16e4..7d008a6 100644 --- a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml +++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml @@ -1 +1 @@ -gpu_memory_utilization: 0.8 +gpu_memory_utilization: 0.6 From c032e0466a12ebee9bd27969fc5e06f0bfd367ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Mon, 12 May 2025 18:20:05 +0200 Subject: [PATCH 07/10] mistralai/Mixtral-8x7B-Instruct-v0.1: set tensor-parallel-size=2 --- mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml b/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml index bfc5627..a8c675c 100644 --- a/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml +++ b/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml @@ -2,3 +2,4 @@ model: 'mistralai/Mixtral-8x7B-Instruct-v0.1' trust-remote-code: true max-model-len: 16384 +tensor-parallel: 2 From 51867fe28afedc2399a5cfc6821ab36ac0ad4a5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 13 May 2025 11:27:38 +0200 Subject: [PATCH 08/10] cleanup configs --- Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml | 7 ++++++- .../accuracy/server-rocm.yml | 5 +++++ .../accuracy/server-rocm.yml | 5 +++++ .../accuracy/server-rocm.yml | 5 +++++ .../accuracy/server-rocm.yml | 5 +++++ .../accuracy/server-rocm.yml | 5 +++++ .../accuracy/server-rocm.yml | 5 +++++ 
RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml | 5 +++++ .../granite-3.1-8b-instruct/accuracy/server-rocm.yml | 5 +++++ meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml | 2 ++ .../accuracy/server-rocm.yml | 5 +++++ .../accuracy/server-rocm.yml | 5 +++++ .../Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml | 4 ++-- 13 files changed, 60 insertions(+), 3 deletions(-) diff --git a/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml b/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml index 7d008a6..40bace5 100644 --- a/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml +++ b/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml @@ -1 +1,6 @@ -gpu_memory_utilization: 0.6 +# common/accuracy/server.yml +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +# override +gpu_memory_utilization: 0.8 diff --git a/RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml index 7d008a6..f2d1abc 100644 --- a/RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml +++ b/RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml @@ -1 +1,6 @@ +# common/accuracy/server.yml +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +# override gpu_memory_utilization: 0.6 diff --git a/RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml b/RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml index 30d16e4..40bace5 100644 --- a/RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml +++ b/RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml @@ -1 +1,6 @@ +# common/accuracy/server.yml +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +# override gpu_memory_utilization: 0.8 diff --git a/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml 
b/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml index 30d16e4..40bace5 100644 --- a/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml +++ b/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml @@ -1 +1,6 @@ +# common/accuracy/server.yml +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +# override gpu_memory_utilization: 0.8 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml index 7d008a6..f2d1abc 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml @@ -1 +1,6 @@ +# common/accuracy/server.yml +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +# override gpu_memory_utilization: 0.6 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml index 7d008a6..f2d1abc 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml @@ -1 +1,6 @@ +# common/accuracy/server.yml +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +# override gpu_memory_utilization: 0.6 diff --git a/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml index 7d008a6..f2d1abc 100644 --- a/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml +++ b/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml @@ -1 +1,6 @@ +# common/accuracy/server.yml +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +# override 
gpu_memory_utilization: 0.6 diff --git a/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml index 7d008a6..f2d1abc 100644 --- a/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml +++ b/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml @@ -1 +1,6 @@ +# common/accuracy/server.yml +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +# override gpu_memory_utilization: 0.6 diff --git a/ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml b/ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml index 7d008a6..f2d1abc 100644 --- a/ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml +++ b/ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml @@ -1 +1,6 @@ +# common/accuracy/server.yml +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +# override gpu_memory_utilization: 0.6 diff --git a/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml b/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml index c57ac61..f2d1abc 100644 --- a/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml +++ b/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml @@ -1,4 +1,6 @@ +# common/accuracy/server.yml trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 +# override gpu_memory_utilization: 0.6 diff --git a/mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml b/mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml index 7d008a6..f2d1abc 100644 --- a/mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml +++ b/mistralai/Mistral-Small-24B-Instruct-2501/accuracy/server-rocm.yml @@ -1 +1,6 @@ +# common/accuracy/server.yml +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +# override gpu_memory_utilization: 0.6 diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml index 
7d008a6..f2d1abc 100644 --- a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml +++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server-rocm.yml @@ -1 +1,6 @@ +# common/accuracy/server.yml +trust-remote-code: true +tensor-parallel-size: 1 +max-model-len: 16384 +# override gpu_memory_utilization: 0.6 diff --git a/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml b/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml index a8c675c..8cc61f3 100644 --- a/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml +++ b/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml @@ -1,5 +1,5 @@ # https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 -model: 'mistralai/Mixtral-8x7B-Instruct-v0.1' +model: "mistralai/Mixtral-8x7B-Instruct-v0.1" trust-remote-code: true +tensor-parallel-size: 2 max-model-len: 16384 -tensor-parallel: 2 From cfa2b6f88007e22518299cedad52d734a077bdd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 13 May 2025 15:40:03 +0200 Subject: [PATCH 09/10] rocm: use enforce-eager to avoid OOM errors --- Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml | 2 +- .../accuracy/server-rocm.yml | 2 +- .../accuracy/server-rocm.yml | 2 +- .../accuracy/server-rocm.yml | 2 +- .../accuracy/server-rocm.yml | 2 +- .../accuracy/server-rocm.yml | 2 +- .../Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml | 2 +- RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml | 2 +- ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml | 2 +- meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml | 2 +- mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml | 4 +++- 11 files changed, 13 insertions(+), 11 deletions(-) diff --git a/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml b/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml index 40bace5..ba5cd1c 100644 --- a/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml +++ b/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml 
@@ -3,4 +3,4 @@ trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 # override -gpu_memory_utilization: 0.8 +enforce-eager: true diff --git a/RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml index f2d1abc..ba5cd1c 100644 --- a/RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml +++ b/RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8-dynamic/accuracy/server-rocm.yml @@ -3,4 +3,4 @@ trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 # override -gpu_memory_utilization: 0.6 +enforce-eager: true diff --git a/RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml b/RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml index 40bace5..ba5cd1c 100644 --- a/RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml +++ b/RedHatAI/Mistral-Small-24B-Instruct-2501-FP8-Dynamic/accuracy/server-rocm.yml @@ -3,4 +3,4 @@ trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 # override -gpu_memory_utilization: 0.8 +enforce-eager: true diff --git a/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml b/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml index 40bace5..ba5cd1c 100644 --- a/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml +++ b/RedHatAI/Mistral-Small-24B-Instruct-2501-quantized.w8a8/accuracy/server-rocm.yml @@ -3,4 +3,4 @@ trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 # override -gpu_memory_utilization: 0.8 +enforce-eager: true diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml index f2d1abc..ba5cd1c 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml +++ 
b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server-rocm.yml @@ -3,4 +3,4 @@ trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 # override -gpu_memory_utilization: 0.6 +enforce-eager: true diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml index f2d1abc..ba5cd1c 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server-rocm.yml @@ -3,4 +3,4 @@ trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 # override -gpu_memory_utilization: 0.6 +enforce-eager: true diff --git a/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml index f2d1abc..ba5cd1c 100644 --- a/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml +++ b/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/server-rocm.yml @@ -3,4 +3,4 @@ trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 # override -gpu_memory_utilization: 0.6 +enforce-eager: true diff --git a/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml b/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml index f2d1abc..ba5cd1c 100644 --- a/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml +++ b/RedHatAI/phi-4-FP8-dynamic/accuracy/server-rocm.yml @@ -3,4 +3,4 @@ trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 # override -gpu_memory_utilization: 0.6 +enforce-eager: true diff --git a/ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml b/ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml index f2d1abc..ba5cd1c 100644 --- a/ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml +++ b/ibm-granite/granite-3.1-8b-instruct/accuracy/server-rocm.yml @@ -3,4 +3,4 @@ trust-remote-code: true 
tensor-parallel-size: 1 max-model-len: 16384 # override -gpu_memory_utilization: 0.6 +enforce-eager: true diff --git a/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml b/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml index f2d1abc..ba5cd1c 100644 --- a/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml +++ b/meta-llama/Llama-3.1-8B-Instruct/accuracy/server-rocm.yml @@ -3,4 +3,4 @@ trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 # override -gpu_memory_utilization: 0.6 +enforce-eager: true diff --git a/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml b/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml index 8cc61f3..6df55a1 100644 --- a/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml +++ b/mistralai/Mixtral-8x7B-Instruct-v0.1/accuracy/server-rocm.yml @@ -1,5 +1,7 @@ # https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 -model: "mistralai/Mixtral-8x7B-Instruct-v0.1" +model: 'mistralai/Mixtral-8x7B-Instruct-v0.1' trust-remote-code: true tensor-parallel-size: 2 max-model-len: 16384 +# override +enforce-eager: true From 483b7d2db7c6246ee6d92669e74e25fe16ec5623 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 13 May 2025 16:31:42 +0200 Subject: [PATCH 10/10] Qwen: override gpu-memory-utilization=0.6 --- Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml b/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml index ba5cd1c..f2d1abc 100644 --- a/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml +++ b/Qwen/Qwen2.5-7B-Instruct/accuracy/server-rocm.yml @@ -3,4 +3,4 @@ trust-remote-code: true tensor-parallel-size: 1 max-model-len: 16384 # override -enforce-eager: true +gpu_memory_utilization: 0.6