From 8a5651230f53aedc85f5b5872125a0278fffe7ee Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Wed, 23 Apr 2025 21:00:54 -0400 Subject: [PATCH 1/4] adjust rtol on certain tasks to match measured --- Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml | 3 +++ RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/tasks.yml | 2 ++ .../Qwen2.5-7B-Instruct-quantized.w4a16/accuracy/tasks.yml | 2 ++ RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml | 2 ++ RedHatAI/Qwen2.5-7B-quantized.w4a16/accuracy/tasks.yml | 2 ++ 5 files changed, 11 insertions(+) diff --git a/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml b/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml index 1b8d8e8..1ecaf24 100644 --- a/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml +++ b/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml @@ -1,5 +1,6 @@ tasks: - name: arc_challenge + rtol: 0.09 metrics: - name: acc_norm,none value: 0.5939 @@ -20,11 +21,13 @@ tasks: value: 0.7415 - name: truthfulqa_mc2 + rtol: 0.13 metrics: - name: acc,none value: 0.5637 - name: winogrande + rtol: 0.09 metrics: - name: acc,none value: 0.7569 diff --git a/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/tasks.yml b/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/tasks.yml index a896989..8b1ec2a 100644 --- a/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/tasks.yml +++ b/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/tasks.yml @@ -1,5 +1,6 @@ tasks: - name: arc_challenge + rtol: 0.16 metrics: - name: acc_norm,none value: 0.6314 @@ -25,6 +26,7 @@ tasks: value: 0.6487 - name: winogrande + rtol: 0.07 metrics: - name: acc,none value: 0.7443 diff --git a/RedHatAI/Qwen2.5-7B-Instruct-quantized.w4a16/accuracy/tasks.yml b/RedHatAI/Qwen2.5-7B-Instruct-quantized.w4a16/accuracy/tasks.yml index 3b21a6c..f630424 100644 --- a/RedHatAI/Qwen2.5-7B-Instruct-quantized.w4a16/accuracy/tasks.yml +++ b/RedHatAI/Qwen2.5-7B-Instruct-quantized.w4a16/accuracy/tasks.yml @@ -1,5 +1,6 @@ tasks: - name: arc_challenge + rtol: 0.13 metrics: - name: acc_norm,none value: 0.6323 @@ -25,6 +26,7 @@ tasks: value: 0.6427 - name: winogrande + rtol: 0.07 metrics: - name: acc,none value: 0.7419 diff --git a/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml b/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml index 6047049..5f44534 100644 --- a/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml +++ b/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml @@ -1,5 +1,6 @@ tasks: - name: arc_challenge + rtol: 0.21 metrics: - name: acc_norm,none value: 0.6323 @@ -25,6 +26,7 @@ tasks: value: 0.6458 - name: winogrande + rtol: 0.05 metrics: - name: acc,none value: 0.7482 diff --git a/RedHatAI/Qwen2.5-7B-quantized.w4a16/accuracy/tasks.yml b/RedHatAI/Qwen2.5-7B-quantized.w4a16/accuracy/tasks.yml index da02244..7094fc4 100644 --- a/RedHatAI/Qwen2.5-7B-quantized.w4a16/accuracy/tasks.yml +++ b/RedHatAI/Qwen2.5-7B-quantized.w4a16/accuracy/tasks.yml @@ -1,5 +1,6 @@ tasks: - name: arc_challenge + rtol: 0.16 metrics: - name: acc_norm,none value: 0.587 @@ -25,6 +26,7 @@ tasks: value: 0.5548 - name: winogrande + rtol: 0.09 metrics: - name: acc,none value: 0.7601 From 049b782f99a5df4a8f3fab5cf1daac8f357527ab Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Wed, 23 Apr 2025 21:25:21 -0400 Subject: [PATCH 2/4] update qwen 2.5 model task rtol --- Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml | 3 ++- RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/tasks.yml | 1 + .../Qwen2.5-7B-Instruct-quantized.w4a16/accuracy/tasks.yml | 3 ++- RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml | 1 + RedHatAI/Qwen2.5-7B-quantized.w4a16/accuracy/tasks.yml | 2 +- 5 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml b/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml index 1ecaf24..9a254d0 100644 --- a/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml +++ b/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml @@ -6,6 +6,7 @@ tasks: value: 0.5939 - name: gsm8k + rtol: 0.05 metrics: - name: exact_match,strict-match value: 0.7976 @@ -21,7 +22,7 @@ tasks: value: 0.7415 - name: truthfulqa_mc2 - rtol: 0.13 + rtol: 0.15 metrics: - name: acc,none value: 0.5637 diff --git a/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/tasks.yml b/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/tasks.yml index 8b1ec2a..c5f2873 100644 --- a/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/tasks.yml +++ b/RedHatAI/Qwen2.5-7B-Instruct-FP8-dynamic/accuracy/tasks.yml @@ -6,6 +6,7 @@ tasks: value: 0.6314 - name: gsm8k + rtol: 0.06 metrics: - name: exact_match,strict-match value: 0.8006 diff --git a/RedHatAI/Qwen2.5-7B-Instruct-quantized.w4a16/accuracy/tasks.yml b/RedHatAI/Qwen2.5-7B-Instruct-quantized.w4a16/accuracy/tasks.yml index f630424..0805ecf 100644 --- a/RedHatAI/Qwen2.5-7B-Instruct-quantized.w4a16/accuracy/tasks.yml +++ b/RedHatAI/Qwen2.5-7B-Instruct-quantized.w4a16/accuracy/tasks.yml @@ -1,11 +1,12 @@ tasks: - name: arc_challenge - rtol: 0.13 + rtol: 0.15 metrics: - name: acc_norm,none value: 0.6323 - name: gsm8k + rtol: 0.09 metrics: - name: exact_match,strict-match value: 0.8059 diff --git a/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml b/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml index 5f44534..d88669e 100644 --- a/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml +++ b/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml @@ -6,6 +6,7 @@ tasks: value: 0.6323 - name: gsm8k + rtol: 0.07 metrics: - name: exact_match,strict-match value: 0.8074 diff --git a/RedHatAI/Qwen2.5-7B-quantized.w4a16/accuracy/tasks.yml b/RedHatAI/Qwen2.5-7B-quantized.w4a16/accuracy/tasks.yml index 7094fc4..cb78516 100644 --- a/RedHatAI/Qwen2.5-7B-quantized.w4a16/accuracy/tasks.yml +++ b/RedHatAI/Qwen2.5-7B-quantized.w4a16/accuracy/tasks.yml @@ -1,6 +1,6 @@ tasks: - name: arc_challenge - rtol: 0.16 + rtol: 0.17 metrics: - name: acc_norm,none value: 0.587 From d843b7afbe9e13179fda9d4e66c9159f6a184f33 Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Thu, 24 Apr 2025 05:38:03 -0400 Subject: [PATCH 3/4] another tweak --- RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml b/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml index d88669e..19cbea7 100644 --- a/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml +++ b/RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8/accuracy/tasks.yml @@ -6,7 +6,7 @@ tasks: value: 0.6323 - name: gsm8k - rtol: 0.07 + rtol: 0.09 metrics: - name: exact_match,strict-match value: 0.8074 From 9a0db40d223d411a0e3d257773e225694b7eec96 Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Fri, 2 May 2025 15:46:39 -0400 Subject: [PATCH 4/4] correct task metric values --- Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml b/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml index 9a254d0..833914d 100644 --- a/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml +++ b/Qwen/Qwen2.5-7B-Instruct/accuracy/tasks.yml @@ -3,33 +3,33 @@ tasks: rtol: 0.09 metrics: - name: acc_norm,none - value: 0.5939 + value: 0.634 - name: gsm8k rtol: 0.05 metrics: - name: exact_match,strict-match - value: 0.7976 + value: 0.8036 - name: hellaswag metrics: - name: acc_norm,none - value: 0.8017 + value: 0.8152 - name: mmlu metrics: - name: acc,none - value: 0.7415 + value: 0.7424 - name: truthfulqa_mc2 rtol: 0.15 metrics: - name: acc,none - value: 0.5637 + value: 0.6476 - name: winogrande rtol: 0.09 metrics: - name: acc,none - value: 0.7569 + value: 0.7466