diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml
new file mode 100644
index 0000000..5b785b4
--- /dev/null
+++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml
@@ -0,0 +1,6 @@
+# server configs for https://huggingface.co/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic
+model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"
+trust-remote-code: true
+tensor-parallel-size: 2
+max-model-len: 16384
+gpu-memory-utilization: 0.5
diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml
new file mode 100644
index 0000000..68e087c
--- /dev/null
+++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml
@@ -0,0 +1,56 @@
+tasks:
+  - name: arc_challenge
+    metrics:
+      - name: acc_norm,none
+        value: 0.7287
+
+  - name: gsm8k
+    rtol: 0.3
+    metrics:
+      - name: exact_match,strict-match
+        value: 0.6247
+
+  - name: hellaswag
+    metrics:
+      - name: acc_norm,none
+        value: 0.8367
+
+  - name: mmlu
+    metrics:
+      - name: acc,none
+        value: 0.8071
+
+  - name: truthfulqa_mc2
+    rtol: 0.12
+    metrics:
+      - name: acc,none
+        value: 0.7088
+
+  - name: winogrande
+    metrics:
+      - name: acc,none
+        value: 0.8256
+
+  # The following are placeholders for mid-level "leaderboard_*" tasks
+  # (OpenLLM v2), pending info on how to calculate the metric
+  # values from the individual subtasks.
+
+  # - name: leaderboard_gpqa_main
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4107
+
+  # - name: leaderboard_gpqa_diamond
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4545
+
+  # - name: leaderboard_mmlu_pro
+  #   metrics:
+  #     - name: acc,none
+  #       value: 0.6686
+
+  # - name: humaneval
+  #   metrics:
+  #     - name: exact_match,none
+  #       value: 0.847
diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml
new file mode 100644
index 0000000..7db0196
--- /dev/null
+++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml
@@ -0,0 +1,6 @@
+# server configs for https://huggingface.co/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16
+model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16"
+trust-remote-code: true
+tensor-parallel-size: 2
+max-model-len: 16384
+gpu-memory-utilization: 0.5
diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml
new file mode 100644
index 0000000..5c0fd8f
--- /dev/null
+++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml
@@ -0,0 +1,58 @@
+tasks:
+  - name: arc_challenge
+    rtol: 0.06
+    metrics:
+      - name: acc_norm,none
+        value: 0.7218
+
+  - name: gsm8k
+    rtol: 0.25
+    metrics:
+      - name: exact_match,strict-match
+        value: 0.6634
+
+  - name: hellaswag
+    metrics:
+      - name: acc_norm,none
+        value: 0.8325
+
+  - name: mmlu
+    metrics:
+      - name: acc,none
+        value: 0.7974
+
+  - name: truthfulqa_mc2
+    rtol: 0.11
+    metrics:
+      - name: acc,none
+        value: 0.6956
+
+  - name: winogrande
+    rtol: 0.06
+    metrics:
+      - name: acc,none
+        value: 0.8343
+
+  # The following are placeholders for mid-level "leaderboard_*" tasks
+  # (OpenLLM v2), pending info on how to calculate the metric
+  # values from the individual subtasks.
+
+  # - name: leaderboard_gpqa_main
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.471
+
+  # - name: leaderboard_gpqa_diamond
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4495
+
+  # - name: leaderboard_mmlu_pro
+  #   metrics:
+  #     - name: acc,none
+  #       value: 0.6656
+
+  # - name: humaneval
+  #   metrics:
+  #     - name: exact_match,none
+  #       value: 0.846
diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml
new file mode 100644
index 0000000..251d985
--- /dev/null
+++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml
@@ -0,0 +1,6 @@
+# server configs for https://huggingface.co/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8
+model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8"
+trust-remote-code: true
+tensor-parallel-size: 2
+max-model-len: 16384
+gpu-memory-utilization: 0.5
diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml
new file mode 100644
index 0000000..90b7fb5
--- /dev/null
+++ b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml
@@ -0,0 +1,57 @@
+tasks:
+  - name: arc_challenge
+    rtol: 0.12
+    metrics:
+      - name: acc_norm,none
+        value: 0.7346
+
+  - name: gsm8k
+    rtol: 0.21
+    metrics:
+      - name: exact_match,strict-match
+        value: 0.7058
+
+  - name: hellaswag
+    metrics:
+      - name: acc_norm,none
+        value: 0.8226
+
+  - name: mmlu
+    metrics:
+      - name: acc,none
+        value: 0.804
+
+  - name: truthfulqa_mc2
+    rtol: 0.1
+    metrics:
+      - name: acc,none
+        value: 0.6915
+
+  - name: winogrande
+    metrics:
+      - name: acc,none
+        value: 0.809
+
+  # The following are placeholders for mid-level "leaderboard_*" tasks
+  # (OpenLLM v2), pending info on how to calculate the metric
+  # values from the individual subtasks.
+
+  # - name: leaderboard_gpqa_main
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4464
+
+  # - name: leaderboard_gpqa_diamond
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4192
+
+  # - name: leaderboard_mmlu_pro
+  #   metrics:
+  #     - name: acc,none
+  #       value: 0.6654
+
+  # - name: humaneval
+  #   metrics:
+  #     - name: exact_match,none
+  #       value: 0.842
diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml
new file mode 100644
index 0000000..809ce4f
--- /dev/null
+++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml
@@ -0,0 +1,10 @@
+# server configs for https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503
+model: "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+trust-remote-code: true
+tensor-parallel-size: 2
+max-model-len: 16384
+gpu-memory-utilization: 0.5
+tokenizer-mode: "mistral"
+config-format: "mistral"
+load-format: "mistral"
+limit-mm-per-prompt: "image=10"
diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml
index 0452569..e58e03f 100644
--- a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml
+++ b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/tasks.yml
@@ -1,197 +1,54 @@
-# collected vllm v0.8.3.post1 on k8s-a100-duo
 tasks:
-  - name: leaderboard_math_algebra_hard
-    metrics:
-      - name: exact_match,none
-        value: 0.703
-
-  - name: leaderboard_math_counting_and_prob_hard
-    metrics:
-      - name: exact_match,none
-        value: 0.489
-
-  - name: leaderboard_math_geometry_hard
-    metrics:
-      - name: exact_match,none
-        value: 0.366
-
-  - name: leaderboard_math_intermediate_algebra_hard
-    metrics:
-      - name: exact_match,none
-        value: 0.283
-
-  - name: leaderboard_math_num_theory_hard
-    metrics:
-      - name: exact_match,none
-        value: 0.476
-
-  - name: leaderboard_math_prealgebra_hard
-    metrics:
-      - name: exact_match,none
-        value: 0.695
-
-  - name: leaderboard_math_precalculus_hard
-    metrics:
-      - name: exact_match,none
-        value: 0.355
-
-  - name: leaderboard_bbh_boolean_expressions
-    metrics:
-      - name: acc_norm,none
-        value: 0.876
-
-  - name: leaderboard_bbh_causal_judgement
-    metrics:
-      - name: acc_norm,none
-        value: 0.652
-
-  - name: leaderboard_bbh_date_understanding
-    metrics:
-      - name: acc_norm,none
-        value: 0.796
-
-  - name: leaderboard_bbh_disambiguation_qa
-    metrics:
-      - name: acc_norm,none
-        value: 0.696
-
-  - name: leaderboard_bbh_formal_fallacies
+  - name: arc_challenge
     metrics:
       - name: acc_norm,none
-        value: 0.684
+        value: 0.7278
 
-  - name: leaderboard_bbh_geometric_shapes
+  - name: gsm8k
     metrics:
-      - name: acc_norm,none
-        value: 0.508
-
-  - name: leaderboard_bbh_hyperbaton
-    metrics:
-      - name: acc_norm,none
-        value: 0.78
-
-  - name: leaderboard_bbh_logical_deduction_five_objects
-    metrics:
-      - name: acc_norm,none
-        value: 0.632
+      - name: exact_match,strict-match
+        value: 0.6535
 
-  - name: leaderboard_bbh_logical_deduction_seven_objects
+  - name: hellaswag
     metrics:
       - name: acc_norm,none
-        value: 0.636
+        value: 0.837
 
-  - name: leaderboard_bbh_logical_deduction_three_objects
+  - name: mmlu
     metrics:
-      - name: acc_norm,none
-        value: 0.876
+      - name: acc,none
+        value: 0.8067
 
-  - name: leaderboard_bbh_movie_recommendation
+  - name: truthfulqa_mc2
     metrics:
-      - name: acc_norm,none
-        value: 0.848
+      - name: acc,none
+        value: 0.7062
 
-  - name: leaderboard_bbh_navigate
+  - name: winogrande
     metrics:
-      - name: acc_norm,none
-        value: 0.688
+      - name: acc,none
+        value: 0.8374
 
-  - name: leaderboard_bbh_object_counting
-    metrics:
-      - name: acc_norm,none
-        value: 0.42
+  # The following are placeholders for mid-level "leaderboard_*" tasks
+  # (OpenLLM v2), pending info on how to calculate the metric
+  # values from the individual subtasks.
 
-  - name: leaderboard_bbh_penguins_in_a_table
-    metrics:
-      - name: acc_norm,none
-        value: 0.767
+  # - name: leaderboard_gpqa_main
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4263
 
-  - name: leaderboard_bbh_reasoning_about_colored_objects
-    metrics:
-      - name: acc_norm,none
-        value: 0.764
+  # - name: leaderboard_gpqa_diamond
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4596
 
-  - name: leaderboard_bbh_ruin_names
-    metrics:
-      - name: acc_norm,none
-        value: 0.868
+  # - name: leaderboard_mmlu_pro
+  #   metrics:
+  #     - name: acc,none
+  #       value: 0.6725
 
-  - name: leaderboard_bbh_salient_translation_error_detection
-    metrics:
-      - name: acc_norm,none
-        value: 0.684
-
-  - name: leaderboard_bbh_snarks
-    metrics:
-      - name: acc_norm,none
-        value: 0.725
-
-  - name: leaderboard_bbh_sports_understanding
-    metrics:
-      - name: acc_norm,none
-        value: 0.836
-
-  - name: leaderboard_bbh_temporal_sequences
-    metrics:
-      - name: acc_norm,none
-        value: 0.984
-
-  - name: leaderboard_bbh_tracking_shuffled_objects_five_objects
-    metrics:
-      - name: acc_norm,none
-        value: 0.288
-
-  - name: leaderboard_bbh_tracking_shuffled_objects_seven_objects
-    metrics:
-      - name: acc_norm,none
-        value: 0.224
-
-  - name: leaderboard_bbh_tracking_shuffled_objects_three_objects
-    metrics:
-      - name: acc_norm,none
-        value: 0.348
-
-  - name: leaderboard_bbh_web_of_lies
-    metrics:
-      - name: acc_norm,none
-        value: 0.52
-
-  - name: leaderboard_gpqa_diamond
-    metrics:
-      - name: acc_norm,none
-        value: 0.399
-
-  - name: leaderboard_gpqa_extended
-    metrics:
-      - name: acc_norm,none
-        value: 0.405
-
-  - name: leaderboard_gpqa_main
-    metrics:
-      - name: acc_norm,none
-        value: 0.393
-
-  - name: leaderboard_musr_murder_mysteries
-    metrics:
-      - name: acc_norm,none
-        value: 0.556
-
-  - name: leaderboard_musr_object_placements
-    metrics:
-      - name: acc_norm,none
-        value: 0.437
-
-  - name: leaderboard_musr_team_allocation
-    metrics:
-      - name: acc_norm,none
-        value: 0.404
-
-  - name: leaderboard_ifeval
-    metrics:
-      - name: prompt_level_strict_acc,none
-        value: 0.582
-      - name: prompt_level_loose_acc,none
-        value: 0.647
-      - name: inst_level_loose_acc,none
-        value: 0.748
-      - name: inst_level_strict_acc,none
-        value: 0.693
+  # - name: humaneval
+  #   metrics:
+  #     - name: exact_match,none
+  #       value: 0.847
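
Note on the commented-out "leaderboard_*" placeholders: while the exact aggregation rule is still pending, one plausible way to derive a mid-level value is a sample-count-weighted mean of the subtask metrics. The sketch below illustrates that idea using the leaderboard_gpqa_* acc_norm values from the old mistralai tasks.yml removed above; the weighted-mean rule itself is an assumption, weighted_group_metric is a hypothetical helper (not part of this repo), and the per-split sample counts are illustrative.

# Sketch: roll per-subtask metric values up into one mid-level group value.
# ASSUMPTION: a sample-count-weighted mean matches the intended aggregation.

def weighted_group_metric(subtasks: dict[str, tuple[float, int]]) -> float:
    """Sample-count-weighted mean of (metric value, sample count) pairs."""
    total = sum(n for _, n in subtasks.values())
    return sum(v * n for v, n in subtasks.values()) / total

if __name__ == "__main__":
    # gpqa_* acc_norm values taken from the removed mistralai tasks.yml above;
    # the sample counts per split are illustrative assumptions.
    gpqa = {
        "leaderboard_gpqa_main": (0.393, 448),
        "leaderboard_gpqa_extended": (0.405, 546),
        "leaderboard_gpqa_diamond": (0.399, 198),
    }
    print(f"leaderboard_gpqa (weighted mean): {weighted_group_metric(gpqa):.4f}")

If the harness reports unweighted group means instead, the same helper applies with all counts set to 1; either way the commented entries can stay as-is until the formula is confirmed.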