Add script to test op perf and compare

yeahdongcn · daniandtheweb · yeahdongcn · commit 64f137e4cf5b · 2025-06-25T08:42:55.000+08:00
Co-authored-by: Daniele <daniele.dilotorres@gmail.com>
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
diff --git a/.gitignore b/.gitignore
@@ -137,6 +137,9 @@ poetry.toml
 /tests/test-tokenizer-1-bpe
 /tests/test-tokenizer-1-spm
 
+# Test reports
+comparison_backend_ops_perf.txt
+
 # Scripts
 !/scripts/install-oneapi.bat
 
diff --git a/scripts/compare-commits-op-perf.sh b/scripts/compare-commits-op-perf.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#
+# Build test-backend-ops at two commits, run it in perf mode for each one and
+# generate a comparison report from the two resulting logs.
+
+if [ $# -lt 2 ]; then
+    echo "usage: ./scripts/compare-commits-op-perf.sh <commit1> <commit2> [additional test-backend-ops arguments]"
+    exit 1
+fi
+
+set -e
+set -x
+# Without pipefail, `set -e` only sees the exit status of `tee` in the pipeline
+# inside run(), so a failing test-backend-ops would go unnoticed.
+set -o pipefail
+
+# Keep the extra arguments as an array so that arguments containing whitespace
+# are passed through to test-backend-ops unchanged.
+test_backend_ops_args=("${@:3}")
+
+# Extract short form of commits (first 7 characters)
+commit1_short="${1:0:7}"
+commit2_short="${2:0:7}"
+
+rm -f test-backend-ops-perf-*.log
+
+# to test a backend, call the script with the corresponding environment variable (e.g. GGML_CUDA=1 ./scripts/compare-commits-op-perf.sh ...)
+if [ -n "$GGML_CUDA" ]; then
+    CMAKE_OPTS="${CMAKE_OPTS} -DGGML_CUDA=ON"
+fi
+
+dir="build-test-backend-ops"
+
+# Rebuild test-backend-ops from the currently checked-out tree and run it in
+# perf mode, saving the output to test-backend-ops-perf-<commit_short>.log.
+function run {
+    commit_short=$1
+    rm -fr "${dir}" > /dev/null
+    # CMAKE_OPTS is intentionally left unquoted: it holds several
+    # space-separated options that must be split into separate words.
+    cmake -B "${dir}" -S . ${CMAKE_OPTS} > /dev/null
+    cmake --build "${dir}" -t test-backend-ops > /dev/null
+    "${dir}/bin/test-backend-ops" "${test_backend_ops_args[@]}" perf 2>&1 | tee "test-backend-ops-perf-${commit_short}.log"
+}
+
+git checkout "$1" > /dev/null
+run "$commit1_short"
+
+git checkout "$2" > /dev/null
+run "$commit2_short"
+
+# NOTE: the comparison script is taken from the tree checked out last (<commit2>).
+./scripts/compare-test-backend-ops-perf.py -b "test-backend-ops-perf-$commit1_short.log" -c "test-backend-ops-perf-$commit2_short.log"
diff --git a/scripts/compare-test-backend-ops-perf.py b/scripts/compare-test-backend-ops-perf.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+
+import argparse
+import logging
+import re
+import sys
+from pathlib import Path
+
+# Configure root logging first: bare messages at INFO level and above.
+logging.basicConfig(format="%(message)s", level=logging.INFO)
+# Module-level logger used by the functions in this script.
+logger = logging.getLogger(__name__)
+
+
+def parse_benchmark_line(line: str):
+    """
+    Parses a single line of benchmark output.
+
+    Example line:
+    MUL_MAT(...): 744 runs - 1660.11 us/run - 134.48 MFLOP/run - 81.01 GFLOPS
+
+    The text before the first ":" is used as the key. The throughput is the
+    trailing "<number> <unit>" pair, where unit is MFLOPS, GFLOPS or TFLOPS.
+
+    Returns a tuple of (key, gflops), with the value normalized to GFLOPS, or
+    (None, None) if the line is not a parsable benchmark result.
+    """
+    # Remove ANSI color codes from the whole line so that neither the key nor
+    # the trailing unit is polluted by terminal escape sequences.
+    line = re.sub(r"\x1b\[[0-9;]*m", "", line).strip()
+    if ":" not in line:
+        return None, None
+
+    key, data_part = line.split(":", 1)
+    key = key.strip()
+    if not key:
+        return None, None
+
+    # Find the last number and unit in the data part
+    match = re.search(r"([\d\.]+)\s+(GFLOPS|TFLOPS|MFLOPS)\s*$", data_part.strip())
+    if not match:
+        return None, None
+
+    value_str, unit = match.groups()
+    try:
+        value = float(value_str)
+    except ValueError:
+        # The pattern also accepts malformed numbers such as "..." or "1.2.3";
+        # treat those lines as unparsable instead of crashing.
+        return None, None
+
+    # Normalize everything to GFLOPS
+    if unit == "TFLOPS":
+        gflops = value * 1000
+    elif unit == "MFLOPS":
+        gflops = value / 1000
+    else:  # GFLOPS
+        gflops = value
+
+    return key, gflops
+
+
+def extract_commit_id(filepath: Path) -> str:
+    """
+    Return the commit ID embedded in a log file name.
+
+    The file name is expected to look like test-backend-ops-perf-<commit_id>.log;
+    an empty string is returned when it does not follow that pattern.
+    """
+    found = re.match(r"test-backend-ops-perf-([^.]+)\.log", filepath.name)
+    return found.group(1) if found else ""
+
+
+def load_results(filepath: Path) -> dict:
+    """
+    Read a benchmark log and collect its results into a dictionary.
+
+    Returns a dict mapping each parsed key to its GFLOPS value. Lines that do
+    not parse are skipped, and a repeated key keeps its last value. Logs an
+    error and exits with status 1 if the file does not exist.
+    """
+    try:
+        with open(filepath, "r", encoding="utf-8") as log_file:
+            parsed_lines = (parse_benchmark_line(raw) for raw in log_file)
+            return {name: value for name, value in parsed_lines if name}
+    except FileNotFoundError:
+        logger.error(f"Error: File not found at {filepath}")
+        sys.exit(1)
+
+
+def format_change(change: float) -> str:
+    """
+    Render a percentage change for the report.
+
+    Values above 0.1 get an explicit "+" sign, values below -0.1 keep their
+    "-" sign, and anything in between is shown as " ~0.00%".
+    """
+    magnitude = f"{change:.2f}%"
+    if change > 0.1:
+        return "+" + magnitude
+    if change < -0.1:
+        return magnitude
+    return " ~0.00%"
+
+
+def main():
+    """
+    Compare two benchmark result files and write a fixed-width text report.
+
+    Command-line arguments:
+        -b/--baseline  path to the baseline results file (required)
+        -c/--compare   path to the results file compared against it (required)
+        -o/--output    report path (default: comparison_backend_ops_perf.txt)
+
+    The report has one row per test configuration found in either file, with
+    both GFLOPS values and the relative change in percent. "N/A" is shown for
+    a value missing from one file, and for the change whenever it cannot be
+    computed (a missing value or a zero baseline).
+
+    Exits with status 1 if either file yields no results.
+    """
+    parser = argparse.ArgumentParser(
+        description="Compare two benchmark result files and generate a report.",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    help_b = "Path to the baseline benchmark results file."
+    parser.add_argument(
+        "-b", "--baseline", dest="baseline", type=Path, required=True, help=help_b
+    )
+    help_c = "Path to the benchmark results file to compare against the baseline."
+    parser.add_argument(
+        "-c", "--compare", dest="compare", type=Path, required=True, help=help_c
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=Path,
+        default=Path("comparison_backend_ops_perf.txt"),
+        help="Path to the output report file (default: comparison_backend_ops_perf.txt).",
+    )
+    args = parser.parse_args()
+
+    logger.info(f"Loading baseline results from: {args.baseline}")
+    baseline_results = load_results(args.baseline)
+    logger.info(f"Loading compare results from: {args.compare}")
+    compare_results = load_results(args.compare)
+
+    if not baseline_results or not compare_results:
+        logger.error("Could not load results from one or both files. Exiting.")
+        # Exit non-zero so that calling scripts can detect the failure.
+        sys.exit(1)
+
+    # Extract commit IDs from filenames
+    baseline_commit = extract_commit_id(args.baseline)
+    compare_commit = extract_commit_id(args.compare)
+
+    # Union of the keys of both runs, so tests present in only one still appear.
+    all_keys = sorted(baseline_results.keys() | compare_results.keys())
+
+    comparisons = []
+
+    for key in all_keys:
+        baseline_val = baseline_results.get(key)
+        compare_val = compare_results.get(key)
+
+        # The relative change is only defined when both values exist and the
+        # baseline is non-zero; otherwise it is left as None and shown as N/A.
+        change = None
+        if baseline_val is not None and compare_val is not None and baseline_val != 0:
+            change = ((compare_val - baseline_val) / baseline_val) * 100
+
+        comparisons.append(
+            {
+                "key": key,
+                "baseline": baseline_val,
+                "compare": compare_val,
+                "change": change,
+            }
+        )
+
+    # --- Generate Report ---
+    with open(args.output, "w", encoding="utf-8") as f:
+
+        # Create header with commit IDs extracted from filenames
+        baseline_header = "Baseline GFLOPS"
+        compare_header = "Compare GFLOPS"
+
+        if baseline_commit:
+            baseline_header = f"Baseline ({baseline_commit}) GFLOPS"
+        if compare_commit:
+            compare_header = f"Compare ({compare_commit}) GFLOPS"
+
+        # all_keys is non-empty here because both result sets are non-empty.
+        key_width = max(len(k) for k in all_keys) + 2
+        header = f"{'Test Configuration':<{key_width}} {baseline_header:>25} {compare_header:>25} {'Change (%)':>15}"
+        f.write(header + "\n")
+        f.write("-" * len(header) + "\n")
+
+        for item in comparisons:
+            baseline_str = (
+                f"{item['baseline']:.2f}" if item["baseline"] is not None else "N/A"
+            )
+            compare_str = (
+                f"{item['compare']:.2f}" if item["compare"] is not None else "N/A"
+            )
+            change_str = (
+                format_change(item["change"]) if item["change"] is not None else "N/A"
+            )
+            f.write(
+                f"{item['key']:<{key_width}} {baseline_str:>25} {compare_str:>25} {change_str:>15}\n"
+            )
+
+    logger.info(f"Comparison report successfully generated at: {args.output}")
+
+
+# Run the comparison only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()