
Commit 16bf895

Flash Attention Benchmarking on B200
Check out the FA repo and run the benchmark as part of this action.
1 parent 4e34d94 commit 16bf895

File tree

1 file changed: +46 −0 lines changed

.github/workflows/flash_attention.yml

Lines changed: 46 additions & 0 deletions
```yaml
name: Flash Attention Benchmark

# To run on every commit to the FA repo, we need to add the following trigger to the FA repo
# run: |
#   curl -XPOST -u "${{ secrets.PAT_USERNAME }}:${{ secrets.PAT_TOKEN }}" -H "Accept: application/vnd.github.everest-preview+json" -H "Content-Type: application/json" https://api.github.com/repos/pytorch/pytorch-integration-testing/dispatches --data '{"event_type": "benchmark_flash_attention"}'

on:
  schedule:
    # Run every 2 hours
    - cron: '0 */2 * * *'
  push:
    paths:
      - .github/workflows/flash_attention.yml
  repository_dispatch:
    types: benchmark_flash_attention
  workflow_dispatch:
jobs:
  benchmark-flash-attn:
    name: Flash Attention CuTe DSL Benchmark
    runs-on: B200
    container:
      # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/
      image: nvcr.io/nvidia/pytorch:25.06-py3
      options: --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864
    steps:
      - name: Build and Run FlashAttention CuTe DSL
        run: |
          set -x
          export CUDA_VISIBLE_DEVICES=0
          echo "Installing nvidia-cutlass-dsl"
          pip install nvidia-cutlass-dsl==4.1.0.dev0
          echo "Installing Flash Attention"
          rm -fr fa4
          git clone https://github.com/Dao-AILab/flash-attention.git fa4
          pushd fa4
          pwd
          git log -1
          python setup.py install
          export PYTHONPATH=$(pwd)

          echo '<h1>B200 1000W</h1>' >> $GITHUB_STEP_SUMMARY
          nvidia-smi
          python benchmarks/benchmark_attn.py >> $GITHUB_STEP_SUMMARY

          popd
          rm -fr fa4
```
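For reference, the `repository_dispatch` hookup described in the comment at the top of the workflow would live on the FA side. Below is a minimal sketch of what that sender workflow could look like; the curl command is taken from the comment above, while the workflow name, file path, trigger branch, and step name are illustrative assumptions, and it presumes `PAT_USERNAME` and `PAT_TOKEN` secrets are configured in the FA repo:

```yaml
# Hypothetical .github/workflows/notify_benchmark.yml in the FA repo;
# only the curl line is from the commit above, the rest is a sketch.
name: Notify Benchmark

on:
  push:
    branches: [main]   # assumed trigger branch

jobs:
  dispatch:
    runs-on: ubuntu-latest
    steps:
      - name: Fire repository_dispatch on pytorch/pytorch-integration-testing
        run: |
          curl -XPOST -u "${{ secrets.PAT_USERNAME }}:${{ secrets.PAT_TOKEN }}" \
            -H "Accept: application/vnd.github.everest-preview+json" \
            -H "Content-Type: application/json" \
            https://api.github.com/repos/pytorch/pytorch-integration-testing/dispatches \
            --data '{"event_type": "benchmark_flash_attention"}'
```

The `event_type` in the payload must match the `types` value under `repository_dispatch` in the benchmark workflow for the dispatch to fire it.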

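The same container and benchmark can also be run by hand outside CI; a rough sketch, assuming a machine with a B200 GPU and a recent NVIDIA driver (the image, container options, and commands mirror the workflow above, the `fa4` directory name included):

```bash
# Launch the same NGC container the workflow uses (image and options copied from the YAML above).
docker run --rm -it --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
  nvcr.io/nvidia/pytorch:25.06-py3 bash

# Inside the container, repeat the workflow's run step:
pip install nvidia-cutlass-dsl==4.1.0.dev0
git clone https://github.com/Dao-AILab/flash-attention.git fa4
cd fa4
python setup.py install
export PYTHONPATH=$(pwd)
python benchmarks/benchmark_attn.py
```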