Add MI300X specs to roofline benchmark (#1913)

mreso · web-flow · commit 2db02923c297 · 2025-03-18T06:31:16.000-07:00
* Add MI300X specs to roofline benchmark

* Fix shape mismatch error by correcting grad_output shape

* Adjust benchmark command in README.md
diff --git a/benchmarks/float8/float8_roofline.py b/benchmarks/float8/float8_roofline.py
@@ -372,7 +372,7 @@ def run(
             ).requires_grad_()
 
             # get the gradient of the right shape
-            grad_output = torch.randn(N_val, K_val, dtype=torch.bfloat16, device="cuda")
+            grad_output = torch.randn(M_val, N_val, dtype=torch.bfloat16, device="cuda")
 
             # get the bf16 gpu kernel time
             torch._dynamo.reset()
diff --git a/torchao/float8/README.md b/torchao/float8/README.md
@@ -139,7 +139,7 @@ Example 2 (large shapes):
 To reproduce the raw data for table above, you can run the following script
 
 ```lang=shell
-python benchmarks/float8/float8_roofline.py your_output_filename.csv --gemm_time_strategy benchmarks --shape_gen_name sweep
+python benchmarks/float8/float8_roofline.py your_output_filename.csv --shape_gen_name sweep
 ```
 
 ## Derivation
diff --git a/torchao/testing/float8/roofline_utils.py b/torchao/testing/float8/roofline_utils.py
@@ -41,6 +41,19 @@
         # TODO(future): measure once we have the hardware
         "pct_achievable_mem_bw": 0.92,
     },
+    "AMD Instinct MI300X": {
+        # https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/data-sheets/amd-instinct-mi300x-data-sheet.pdf (page 1)
+        "bf16_peak_tops": 1307e12,
+        "fp8_peak_tops": 2614e12,
+        # 5.3 TB per second
+        "peak_mem_bw_bytes_sec": 5.3e12,
+        # for now, copy over from H100
+        # TODO(future): run measurement on hardware
+        "pct_achievable_gemm_tops": 0.78,
+        # for now, copy over from H100
+        # TODO(future): run measurement on hardware
+        "pct_achievable_mem_bw": 0.92,
+    },
     # TODO(future): more GPU names
 }