Add script + configs (#157)

drisspg · web-flow · commit 189dc56bb477 · 2025-07-16T20:20:59.000-07:00
diff --git a/benchmarks/configs/config_basic.yaml b/benchmarks/configs/config_basic.yaml
@@ -0,0 +1,29 @@
+# Basic benchmark configuration for flex attention
+# Usage (from the benchmarks/ directory): python flex_perf.py --config configs/config_basic.yaml
+
+# Core parameters
+dynamic: false
+calculate_bwd: false
+dtype: "bfloat16"
+
+# Shape parameters
+b: [2, 8, 16]  # batch sizes
+nh: ["16, 16", "16, 4"]  # [query_heads,key_value_heads]
+s: [1024, 4096]  # sequence lengths
+d: [64, 128]  # head dimensions
+
+# Attention types to benchmark
+mods: ["noop", "causal", "alibi", "sliding_window"]
+
+# Backend and optimization
+backend: []
+max_autotune: false
+
+# Decoding and cache settings
+decoding: false
+kv_size: null  # Use batch sizes instead
+
+# Metrics and output
+throughput: true  # Always calculate TBS and TFLOPs
+show_speedups: false  # Show speedup calculations
+save_path: null  # No CSV output
diff --git a/benchmarks/configs/config_comprehensive.yaml b/benchmarks/configs/config_comprehensive.yaml
@@ -0,0 +1,29 @@
+# Comprehensive benchmark configuration for flex attention
+# Usage (from the benchmarks/ directory): python flex_perf.py --config configs/config_comprehensive.yaml
+
+# Core parameters
+dynamic: false
+calculate_bwd: true  # Include backward pass timing
+dtype: "bfloat16"
+
+# Shape parameters - larger sweep
+b: [1, 2, 4, 8, 16, 32]  # batch sizes
+nh: ["16,16", "16,2", "32,32", "32,4"]  # [query_heads,key_value_heads]
+s: [512, 1024, 2048, 4096, 8192]  # sequence lengths
+d: [64, 128, 256]  # head dimensions
+
+# All attention types
+mods: ["noop", "causal", "rel", "head_bias", "alibi", "sliding_window", "prefix_lm", "softcap"]
+
+# Multiple backends for comparison
+backend: ["efficient", "math", "fav2"]
+max_autotune: true
+
+# Decoding and cache settings
+decoding: false
+kv_size: null
+
+# Metrics and output
+throughput: true  # Always calculate TBS and TFLOPs
+show_speedups: true  # Show speedup calculations
+save_path: "comprehensive_results.csv"  # Save to CSV
diff --git a/benchmarks/configs/config_decoding.yaml b/benchmarks/configs/config_decoding.yaml
@@ -0,0 +1,29 @@
+# Decoding benchmark configuration for flex attention
+# Usage (from the benchmarks/ directory): python flex_perf.py --config configs/config_decoding.yaml
+
+# Core parameters
+dynamic: false
+calculate_bwd: false  # Decoding doesn't support backward
+dtype: "bfloat16"
+
+# Shape parameters for decoding (query length = 1)
+b: [1, 4, 8, 16]  # batch sizes
+nh: ["16,16", "16,2", "32,32"]  # [query_heads,key_value_heads]
+s: [1024, 2048, 4096, 8192]  # KV sequence lengths
+d: [64, 128]  # head dimensions
+
+# Attention types suitable for decoding
+mods: ["causal", "alibi", "sliding_window", "softcap"]
+
+# Backends including decoding-optimized ones
+backend: ["efficient", "fav2", "fakv"]
+max_autotune: false
+
+# Decoding and cache settings
+decoding: true  # Enable decoding mode
+kv_size: null
+
+# Metrics and output
+throughput: true  # Always calculate TBS and TFLOPs
+show_speedups: false  # Focus on raw performance metrics
+save_path: "decoding_results.csv"
diff --git a/benchmarks/configs/config_memory_bound.yaml b/benchmarks/configs/config_memory_bound.yaml
@@ -0,0 +1,29 @@
+# Memory-bound benchmark configuration for flex attention
+# Usage (from the benchmarks/ directory): python flex_perf.py --config configs/config_memory_bound.yaml
+
+# Core parameters
+dynamic: false
+calculate_bwd: false
+dtype: "bfloat16"
+
+# Shape parameters - focus on memory efficiency
+b: [1, 2, 4]  # smaller batch sizes (superseded by kv_size below when it is set)
+nh: ["16,16", "32,32"]  # [query_heads,key_value_heads]
+s: [4096, 8192, 16384]  # longer sequences
+d: [128, 256]  # larger head dimensions
+
+# Attention types that benefit from memory optimization
+mods: ["causal", "sliding_window", "document_mask"]
+
+# Efficient backends
+backend: ["efficient", "fav2"]
+max_autotune: true
+
+# Use KV cache size instead of batch size
+decoding: false
+kv_size: [256, 512, 1024]  # KV cache size in MiB
+
+# Metrics and output
+throughput: true  # Always calculate TBS and TFLOPs
+show_speedups: true
+save_path: "memory_bound_results.csv"
diff --git a/benchmarks/flex_perf.py b/benchmarks/flex_perf.py
diff --git a/pyproject.toml b/pyproject.toml