From 5a692165baabac6feac45b22fe0bde22c3d52c62 Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Mon, 21 Apr 2025 13:43:04 -0400 Subject: [PATCH 1/9] tasks.yml for Mistral-Small-3.1-24B-Instruct-2503 --- .../accuracy/tasks.yml | 54 +++++ .../accuracy/tasks.yml | 54 +++++ .../accuracy/tasks.yml | 54 +++++ .../accuracy/tasks.yml | 213 +++--------------- 4 files changed, 197 insertions(+), 178 deletions(-) create mode 100644 RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml create mode 100644 RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml create mode 100644 RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml new file mode 100644 index 0000000..c496623 --- /dev/null +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml @@ -0,0 +1,54 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.7287 + + - name: gsm8k + metrics: + - name: strict_match,none + value: 0.6247 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8467 + + - name: mmlu + metrics: + - name: acc,none + value: 0.8071 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.7088 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8256 + + # following are placeholders for mid-level "leaderboard_*" tasks + # (OpenLLM v2) waiting for info on how to calculate the metric + # values from the individual sub tasks. + + # - name: leaderboard_gpqa_main + # metrics: + # - name: acc-norm,none + # value: 0.4107 + + # - name: leaderboard_gpqa_diamond + # metrics: + # - name: acc-norm,none + # value: 0.4545 + + # - name: leaderboard_mmlu_pro + # metrics: + # - name: acc,none + # value: 0.6686 + + # - name: humaneval + # metrics: + # - name: exact_match,none + # value: 0.847 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml new file mode 100644 index 0000000..ea9ff15 --- /dev/null +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml @@ -0,0 +1,54 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.7218 + + - name: gsm8k + metrics: + - name: strict_match,none + value: 0.6634 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8325 + + - name: mmlu + metrics: + - name: acc,none + value: 0.7974 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.6956 + + - name: winogrande + metrics: + - name: acc,none + value: 0.8343 + + # following are placeholders for mid-level "leaderboard_*" tasks + # (OpenLLM v2) waiting for info on how to calculate the metric + # values from the individual sub tasks. + + # - name: leaderboard_gpqa_main + # metrics: + # - name: acc-norm,none + # value: 0.471 + + # - name: leaderboard_gpqa_diamond + # metrics: + # - name: acc-norm,none + # value: 0.4495 + + # - name: leaderboard_mmlu_pro + # metrics: + # - name: acc,none + # value: 0.6656 + + # - name: humaneval + # metrics: + # - name: exact_match,none + # value: 0.846 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml new file mode 100644 index 0000000..fd88f02 --- /dev/null +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml @@ -0,0 +1,54 @@ +tasks: + - name: arc_challenge + metrics: + - name: acc_norm,none + value: 0.7346 + + - name: gsm8k + metrics: + - name: strict_match,none + value: 0.7058 + + - name: hellaswag + metrics: + - name: acc_norm,none + value: 0.8226 + + - name: mmlu + metrics: + - name: acc,none + value: 0.804 + + - name: truthfulqa_mc2 + metrics: + - name: acc,none + value: 0.6915 + + - name: winogrande + metrics: + - name: acc,none + value: 0.809 + + # following are placeholders for mid-level "leaderboard_*" tasks + # (OpenLLM v2) waiting for info on how to calculate the metric + # values from the individual sub tasks. + + # - name: leaderboard_gpqa_main + # metrics: + # - name: acc-norm,none + # value: 0.4464 + + # - name: leaderboard_gpqa_diamond + # metrics: + # - name: acc-norm,none + # value: 0.4192 + + # - name: leaderboard_mmlu_pro + # metrics: + # - name: acc,none + # value: 0.6654 + + # - name: humaneval + # metrics: + # - name: exact_match,none + # value: 0.842 diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml index 0452569..8232701 100644 --- a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml +++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml @@ -1,197 +1,54 @@ -# collected vllm v0.8.3.post1 on k8s-a100-duo tasks: - - name: leaderboard_math_algebra_hard - metrics: - - name: exact_match,none - value: 0.703 - - - name: leaderboard_math_counting_and_prob_hard - metrics: - - name: exact_match,none - value: 0.489 - - - name: leaderboard_math_geometry_hard - metrics: - - name: exact_match,none - value: 0.366 - - - name: leaderboard_math_intermediate_algebra_hard - metrics: - - name: exact_match,none - value: 0.283 - - - name: leaderboard_math_num_theory_hard - metrics: - - name: exact_match,none - value: 0.476 - - - name: leaderboard_math_prealgebra_hard - metrics: - - name: exact_match,none - value: 0.695 - - - name: leaderboard_math_precalculus_hard - metrics: - - name: exact_match,none - value: 0.355 - - - name: leaderboard_bbh_boolean_expressions - metrics: - - name: acc_norm,none - value: 0.876 - - - name: leaderboard_bbh_causal_judgement - metrics: - - name: acc_norm,none - value: 0.652 - - - name: leaderboard_bbh_date_understanding - metrics: - - name: acc_norm,none - value: 0.796 - - - name: leaderboard_bbh_disambiguation_qa - metrics: - - name: acc_norm,none - value: 0.696 - - - name: leaderboard_bbh_formal_fallacies + - name: arc_challenge metrics: - name: acc_norm,none - value: 0.684 + value: 0.7278 - - name: leaderboard_bbh_geometric_shapes + - name: gsm8k metrics: - - name: acc_norm,none - value: 0.508 - - - name: leaderboard_bbh_hyperbaton - metrics: - - name: acc_norm,none - value: 0.78 - - - name: leaderboard_bbh_logical_deduction_five_objects - metrics: - - name: acc_norm,none - value: 0.632 + - name: strict_match,none + value: 0.6535 - - name: leaderboard_bbh_logical_deduction_seven_objects + - name: hellaswag metrics: - name: acc_norm,none - value: 0.636 + value: 0.837 - - name: leaderboard_bbh_logical_deduction_three_objects + - name: mmlu metrics: - - name: acc_norm,none - value: 0.876 + - name: acc,none + value: 0.8067 - - name: leaderboard_bbh_movie_recommendation + - name: truthfulqa_mc2 metrics: - - name: acc_norm,none - value: 0.848 + - name: acc,none + value: 0.7062 - - name: leaderboard_bbh_navigate + - name: winogrande metrics: - - name: acc_norm,none - value: 0.688 + - name: acc,none + value: 0.8374 - - name: leaderboard_bbh_object_counting - metrics: - - name: acc_norm,none - value: 0.42 + # following are placeholders for mid-level "leaderboard_*" tasks + # (OpenLLM v2) waiting for info on how to calculate the metric + # values from the individual sub tasks. - - name: leaderboard_bbh_penguins_in_a_table - metrics: - - name: acc_norm,none - value: 0.767 + # - name: leaderboard_gpqa_main + # metrics: + # - name: acc-norm,none + # value: 0.4263 - - name: leaderboard_bbh_reasoning_about_colored_objects - metrics: - - name: acc_norm,none - value: 0.764 + # - name: leaderboard_gpqa_diamond + # metrics: + # - name: acc-norm,none + # value: 0.4596 - - name: leaderboard_bbh_ruin_names - metrics: - - name: acc_norm,none - value: 0.868 + # - name: leaderboard_mmlu_pro + # metrics: + # - name: acc,none + # value: 0.6725 - - name: leaderboard_bbh_salient_translation_error_detection - metrics: - - name: acc_norm,none - value: 0.684 - - - name: leaderboard_bbh_snarks - metrics: - - name: acc_norm,none - value: 0.725 - - - name: leaderboard_bbh_sports_understanding - metrics: - - name: acc_norm,none - value: 0.836 - - - name: leaderboard_bbh_temporal_sequences - metrics: - - name: acc_norm,none - value: 0.984 - - - name: leaderboard_bbh_tracking_shuffled_objects_five_objects - metrics: - - name: acc_norm,none - value: 0.288 - - - name: leaderboard_bbh_tracking_shuffled_objects_seven_objects - metrics: - - name: acc_norm,none - value: 0.224 - - - name: leaderboard_bbh_tracking_shuffled_objects_three_objects - metrics: - - name: acc_norm,none - value: 0.348 - - - name: leaderboard_bbh_web_of_lies - metrics: - - name: acc_norm,none - value: 0.52 - - - name: leaderboard_gpqa_diamond - metrics: - - name: acc_norm,none - value: 0.399 - - - name: leaderboard_gpqa_extended - metrics: - - name: acc_norm,none - value: 0.405 - - - name: leaderboard_gpqa_main - metrics: - - name: acc_norm,none - value: 0.393 - - - name: leaderboard_musr_murder_mysteries - metrics: - - name: acc_norm,none - value: 0.556 - - - name: leaderboard_musr_object_placements - metrics: - - name: acc_norm,none - value: 0.437 - - - name: leaderboard_musr_team_allocation - metrics: - - name: acc_norm,none - value: 0.404 - - - name: leaderboard_ifeval - metrics: - - name: prompt_level_strict_acc,none - value: 0.582 - - name: prompt_level_loose_acc,none - value: 0.647 - - name: inst_level_loose_acc,none - value: 0.748 - - name: inst_level_strict_acc,none - value: 0.693 + # - name: humaneval + # metrics: + # - name: exact_match,none + # value: 0.847 From 0da61cb1ac9a12cd4a04a0fa066600c74d088d81 Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Wed, 23 Apr 2025 09:10:16 -0400 Subject: [PATCH 2/9] tweak some rtol --- .../accuracy/tasks.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml index ea9ff15..655a79f 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml @@ -1,5 +1,6 @@ tasks: - name: arc_challenge + rtol: 0.06 metrics: - name: acc_norm,none value: 0.7218 @@ -20,11 +21,13 @@ tasks: value: 0.7974 - name: truthfulqa_mc2 + rtol: 0.11 metrics: - name: acc,none value: 0.6956 - name: winogrande + rtol: 0.06 metrics: - name: acc,none value: 0.8343 From 9d400674817cdf0c604bbc8d533fb29ddfcb180d Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Wed, 23 Apr 2025 12:08:10 -0400 Subject: [PATCH 3/9] rtol and server config changes --- .../accuracy/tasks.yml | 2 ++ .../Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml index fd88f02..f63959a 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml @@ -1,5 +1,6 @@ tasks: - name: arc_challenge + rtol: 0.12 metrics: - name: acc_norm,none value: 0.7346 @@ -20,6 +21,7 @@ tasks: value: 0.804 - name: truthfulqa_mc2 + rtol: 0.1 metrics: - name: acc,none value: 0.6915 diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml index 95f0f9c..585446c 100644 --- a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml +++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml @@ -6,7 +6,4 @@ tensor-parallel-size: 1 max-model-len: 4096 tokenizer-mode: "mistral" config-format: "mistral" -load-format: "mistral" -tool-call-parser: "mistral" -enable-auto-tool-choice: true limit-mm-per-prompt: "image=10" From cba51ca7c702ce6a2f64d7c8e8774f8d5b08f2d4 Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Thu, 24 Apr 2025 12:30:23 -0400 Subject: [PATCH 4/9] bring back "load_format" server option --- .../Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml index 585446c..85a7239 100644 --- a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml +++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml @@ -6,4 +6,5 @@ tensor-parallel-size: 1 max-model-len: 4096 tokenizer-mode: "mistral" config-format: "mistral" +load_format: "mistral" limit-mm-per-prompt: "image=10" From 07b910bfb6ba69173097e3c2e06b83a6571faedc Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Thu, 24 Apr 2025 15:46:38 -0400 Subject: [PATCH 5/9] adjust server settings per RedHatAI model cards --- .../accuracy/server.yml | 5 +++-- .../accuracy/server.yml | 5 +++-- .../accuracy/server.yml | 5 +++-- .../Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml | 5 +++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml index d90de0e..b6cd2b9 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml @@ -2,5 +2,6 @@ model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic" trust-remote-code: true enable-chunked-prefill: true -tensor-parallel-size: 1 -max-model-len: 4096 +tensor-parallel-size: 2 +max-model-len: 8192 +gpu_memory_utilization: 0.5 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml index aa76495..ebe3a16 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml @@ -2,5 +2,6 @@ model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16" trust-remote-code: true enable-chunked-prefill: true -tensor-parallel-size: 1 -max-model-len: 4096 +tensor-parallel-size: 2 +max-model-len: 8192 +gpu_memory_utilization: 0.5 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml index 021dd77..0154249 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml @@ -2,5 +2,6 @@ model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8" trust-remote-code: true enable-chunked-prefill: true -tensor-parallel-size: 1 -max-model-len: 4096 +tensor-parallel-size: 2 +max-model-len: 8192 +gpu_memory_utilization: 0.5 diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml index 85a7239..29fad50 100644 --- a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml +++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml @@ -2,8 +2,9 @@ model: "mistralai/Mistral-Small-3.1-24B-Instruct-2503" trust-remote-code: true enable-chunked-prefill: true -tensor-parallel-size: 1 -max-model-len: 4096 +tensor-parallel-size: 2 +max-model-len: 8192 +gpu_memory_utilization: 0.5 tokenizer-mode: "mistral" config-format: "mistral" load_format: "mistral" From 2568889e548d21c44a4e5af5ba1846652d388377 Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Thu, 24 Apr 2025 16:05:58 -0400 Subject: [PATCH 6/9] fix gsm8k metric name --- .../accuracy/tasks.yml | 2 +- .../accuracy/tasks.yml | 2 +- .../accuracy/tasks.yml | 2 +- .../Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml index c496623..eba7b7f 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml @@ -6,7 +6,7 @@ tasks: - name: gsm8k metrics: - - name: strict_match,none + - name: exact_match,strict-match value: 0.6247 - name: hellaswag diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml index 655a79f..2a87140 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml @@ -7,7 +7,7 @@ tasks: - name: gsm8k metrics: - - name: strict_match,none + - name: exact_match,strict-match value: 0.6634 - name: hellaswag diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml index f63959a..217a95d 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml @@ -7,7 +7,7 @@ tasks: - name: gsm8k metrics: - - name: strict_match,none + - name: exact_match,strict-match value: 0.7058 - name: hellaswag diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml index 8232701..e58e03f 100644 --- a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml +++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml @@ -6,7 +6,7 @@ tasks: - name: gsm8k metrics: - - name: strict_match,none + - name: exact_match,strict-match value: 0.6535 - name: hellaswag From 9d3a42ba491c9f86d9db26082cb1e43f4a37b566 Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Thu, 24 Apr 2025 17:01:48 -0400 Subject: [PATCH 7/9] set gsm8k rtol --- .../accuracy/tasks.yml | 1 + .../accuracy/tasks.yml | 1 + .../accuracy/tasks.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml index eba7b7f..8a77857 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml @@ -5,6 +5,7 @@ tasks: value: 0.7287 - name: gsm8k + rtol: 0.28 metrics: - name: exact_match,strict-match value: 0.6247 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml index 2a87140..5c0fd8f 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml @@ -6,6 +6,7 @@ tasks: value: 0.7218 - name: gsm8k + rtol: 0.25 metrics: - name: exact_match,strict-match value: 0.6634 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml index 217a95d..90b7fb5 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml @@ -6,6 +6,7 @@ tasks: value: 0.7346 - name: gsm8k + rtol: 0.21 metrics: - name: exact_match,strict-match value: 0.7058 From 82e50c90969dabcbcc9b450b0ada9ae4c4753b4a Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Fri, 25 Apr 2025 06:36:11 -0400 Subject: [PATCH 8/9] tweak rtol --- .../accuracy/tasks.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml index 8a77857..aceb998 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml @@ -5,7 +5,7 @@ tasks: value: 0.7287 - name: gsm8k - rtol: 0.28 + rtol: 0.3 metrics: - name: exact_match,strict-match value: 0.6247 @@ -21,6 +21,7 @@ tasks: value: 0.8071 - name: truthfulqa_mc2 + rtol: 0.12 metrics: - name: acc,none value: 0.7088 From f066ad7250adf517f66abd5b74ff85836bafc81c Mon Sep 17 00:00:00 2001 From: Derek Kozikowski <106621615+derekk-nm@users.noreply.github.com> Date: Fri, 2 May 2025 14:43:12 -0400 Subject: [PATCH 9/9] fix a metric value typo, recover model server.yml but w/ common values for shared attributes where appropriate. --- .../accuracy/server.yml | 3 +-- .../accuracy/tasks.yml | 2 +- .../accuracy/server.yml | 3 +-- .../accuracy/server.yml | 3 +-- .../Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml | 3 +-- 5 files changed, 5 insertions(+), 9 deletions(-) diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml index b6cd2b9..5b785b4 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml @@ -1,7 +1,6 @@ # server configs for https://huggingface.co/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic" trust-remote-code: true -enable-chunked-prefill: true tensor-parallel-size: 2 -max-model-len: 8192 +max-model-len: 16384 gpu_memory_utilization: 0.5 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml index aceb998..68e087c 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml @@ -13,7 +13,7 @@ tasks: - name: hellaswag metrics: - name: acc_norm,none - value: 0.8467 + value: 0.8367 - name: mmlu metrics: diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml index ebe3a16..7db0196 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml @@ -1,7 +1,6 @@ # server configs for https://huggingface.co/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16 model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16" trust-remote-code: true -enable-chunked-prefill: true tensor-parallel-size: 2 -max-model-len: 8192 +max-model-len: 16384 gpu_memory_utilization: 0.5 diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml index 0154249..251d985 100644 --- a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml +++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml @@ -1,7 +1,6 @@ # server configs for https://huggingface.co/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8" trust-remote-code: true -enable-chunked-prefill: true tensor-parallel-size: 2 -max-model-len: 8192 +max-model-len: 16384 gpu_memory_utilization: 0.5 diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml index 29fad50..809ce4f 100644 --- a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml +++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml @@ -1,9 +1,8 @@ # server configs for https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503 model: "mistralai/Mistral-Small-3.1-24B-Instruct-2503" trust-remote-code: true -enable-chunked-prefill: true tensor-parallel-size: 2 -max-model-len: 8192 +max-model-len: 16384 gpu_memory_utilization: 0.5 tokenizer-mode: "mistral" config-format: "mistral"