neuralmagic · derekk-nm · Apr 21, 2025 · Apr 23, 2025 · Apr 23, 2025 · Apr 24, 2025
diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/server.yml
@@ -0,0 +1,6 @@
+# Server configs (accuracy suite) for https://huggingface.co/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic
+model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"
+trust-remote-code: true
+tensor-parallel-size: 2
+max-model-len: 16384
+gpu-memory-utilization: 0.5  # kebab-case like the sibling keys; presumably a 0-1 fraction - confirm with the consumer
diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic/accuracy/tasks.yml
@@ -0,0 +1,56 @@
+tasks:  # per-task reference metric values for the FP8-dynamic model; presumably compared against measured scores - confirm in the harness
+  - name: arc_challenge
+    metrics:
+      - name: acc_norm,none
+        value: 0.7287
+
+  - name: gsm8k
+    rtol: 0.3  # NOTE(review): presumably a relative tolerance for this task's metrics - confirm
+    metrics:
+      - name: exact_match,strict-match
+        value: 0.6247
+
+  - name: hellaswag
+    metrics:
+      - name: acc_norm,none
+        value: 0.8367
+
+  - name: mmlu
+    metrics:
+      - name: acc,none
+        value: 0.8071
+
+  - name: truthfulqa_mc2
+    rtol: 0.12  # NOTE(review): presumably a relative tolerance for this task's metrics - confirm
+    metrics:
+      - name: acc,none
+        value: 0.7088
+
+  - name: winogrande
+    metrics:
+      - name: acc,none
+        value: 0.8256
+
+  # The following are placeholders for mid-level "leaderboard_*" tasks
+  # (OpenLLM v2), pending information on how to calculate the metric
+  # values from the individual subtasks.
+
+  # - name: leaderboard_gpqa_main
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4107
+
+  # - name: leaderboard_gpqa_diamond
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4545
+
+  # - name: leaderboard_mmlu_pro
+  #   metrics:
+  #     - name: acc,none
+  #       value: 0.6686
+
+  # - name: humaneval
+  #   metrics:
+  #     - name: exact_match,none
+  #       value: 0.847
diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/server.yml
@@ -0,0 +1,6 @@
+# Server configs (accuracy suite) for https://huggingface.co/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16
+model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16"
+trust-remote-code: true
+tensor-parallel-size: 2
+max-model-len: 16384
+gpu-memory-utilization: 0.5  # kebab-case like the sibling keys; presumably a 0-1 fraction - confirm with the consumer
diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16/accuracy/tasks.yml
@@ -0,0 +1,58 @@
+tasks:  # per-task reference metric values for the w4a16 model; presumably compared against measured scores - confirm in the harness
+  - name: arc_challenge
+    rtol: 0.06  # NOTE(review): presumably a relative tolerance for this task's metrics - confirm
+    metrics:
+      - name: acc_norm,none
+        value: 0.7218
+
+  - name: gsm8k
+    rtol: 0.25  # NOTE(review): presumably a relative tolerance for this task's metrics - confirm
+    metrics:
+      - name: exact_match,strict-match
+        value: 0.6634
+
+  - name: hellaswag
+    metrics:
+      - name: acc_norm,none
+        value: 0.8325
+
+  - name: mmlu
+    metrics:
+      - name: acc,none
+        value: 0.7974
+
+  - name: truthfulqa_mc2
+    rtol: 0.11  # NOTE(review): presumably a relative tolerance for this task's metrics - confirm
+    metrics:
+      - name: acc,none
+        value: 0.6956
+
+  - name: winogrande
+    rtol: 0.06  # NOTE(review): presumably a relative tolerance for this task's metrics - confirm
+    metrics:
+      - name: acc,none
+        value: 0.8343
+
+  # The following are placeholders for mid-level "leaderboard_*" tasks
+  # (OpenLLM v2), pending information on how to calculate the metric
+  # values from the individual subtasks.
+
+  # - name: leaderboard_gpqa_main
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.471
+
+  # - name: leaderboard_gpqa_diamond
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4495
+
+  # - name: leaderboard_mmlu_pro
+  #   metrics:
+  #     - name: acc,none
+  #       value: 0.6656
+
+  # - name: humaneval
+  #   metrics:
+  #     - name: exact_match,none
+  #       value: 0.846
diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/server.yml
@@ -0,0 +1,6 @@
+# Server configs (accuracy suite) for https://huggingface.co/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8
+model: "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8"
+trust-remote-code: true
+tensor-parallel-size: 2
+max-model-len: 16384
+gpu-memory-utilization: 0.5  # kebab-case like the sibling keys; presumably a 0-1 fraction - confirm with the consumer
diff --git a/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml b/RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8/accuracy/tasks.yml
@@ -0,0 +1,57 @@
+tasks:  # per-task reference metric values for the w8a8 model; presumably compared against measured scores - confirm in the harness
+  - name: arc_challenge
+    rtol: 0.12  # NOTE(review): presumably a relative tolerance for this task's metrics - confirm
+    metrics:
+      - name: acc_norm,none
+        value: 0.7346
+
+  - name: gsm8k
+    rtol: 0.21  # NOTE(review): presumably a relative tolerance for this task's metrics - confirm
+    metrics:
+      - name: exact_match,strict-match
+        value: 0.7058
+
+  - name: hellaswag
+    metrics:
+      - name: acc_norm,none
+        value: 0.8226
+
+  - name: mmlu
+    metrics:
+      - name: acc,none
+        value: 0.804
+
+  - name: truthfulqa_mc2
+    rtol: 0.1  # NOTE(review): presumably a relative tolerance for this task's metrics - confirm
+    metrics:
+      - name: acc,none
+        value: 0.6915
+
+  - name: winogrande
+    metrics:
+      - name: acc,none
+        value: 0.809
+
+  # The following are placeholders for mid-level "leaderboard_*" tasks
+  # (OpenLLM v2), pending information on how to calculate the metric
+  # values from the individual subtasks.
+
+  # - name: leaderboard_gpqa_main
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4464
+
+  # - name: leaderboard_gpqa_diamond
+  #   metrics:
+  #     - name: acc_norm,none
+  #       value: 0.4192
+
+  # - name: leaderboard_mmlu_pro
+  #   metrics:
+  #     - name: acc,none
+  #       value: 0.6654
+
+  # - name: humaneval
+  #   metrics:
+  #     - name: exact_match,none
+  #       value: 0.842
diff --git a/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml b/mistralai/Mistral-Small-3.1-24B-Instruct-2503/accuracy/server.yml
@@ -0,0 +1,10 @@
+# Server configs (accuracy suite) for https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503
+model: "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+trust-remote-code: true
+tensor-parallel-size: 2
+max-model-len: 16384
+gpu-memory-utilization: 0.5  # kebab-case like the sibling keys; presumably a 0-1 fraction - confirm with the consumer
+tokenizer-mode: "mistral"
+config-format: "mistral"
+load-format: "mistral"  # kebab-case, matching tokenizer-mode / config-format above
+limit-mm-per-prompt: "image=10"