File tree: 1 file changed, +48 −0 lines changed
Original file line number / Diff line number / Diff line change
---
# Benchmarks Flash Attention (CuTe DSL) on a B200 runner inside the NVIDIA
# PyTorch NGC container, publishing results to the job step summary.
name: Flash Attention Benchmark

# attempt 0.1
# Run on every commit to the FA repo
# FA repo will run this on every commit
# run: |
#   curl -XPOST -u "${{ secrets.PAT_USERNAME}}:${{secrets.PAT_TOKEN}}" -H "Accept: application/vnd.github.everest-preview+json" -H "Content-Type: application/json" https://api.github.com/repos/pytorch/pytorch-integration-testing/dispatches --data '{"event_type": "benchmark_flash_attention"}'

on:
  schedule:
    # Run every 2 hours (no leading space inside the cron string —
    # GitHub's cron parser is strict about the five-field format)
    - cron: '0 */2 * * *'
  push:
    paths:
      - .github/workflows/flash_attention.yml
  workflow_dispatch:  # Allow manual triggering
  repository_dispatch:
    # `types` is documented as a sequence of event_type names
    types: [benchmark_flash_attention]

jobs:
  benchmark-flash-attn:
    name: Flash Attention CuTe DSL Benchmark
    runs-on: B200
    container:
      # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/
      image: nvcr.io/nvidia/pytorch:25.06-py3
      options: --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864
    steps:
      - name: CuTe DSL
        # with:
        #   runner: linux.g5.48xlarge.nvidia.gpu
        #   gpu-arch-type: cuda
        #   gpu-arch-version: "12.9.1"
        #   timeout: 600
        run: |
          # -e/-u make the step fail fast on any command error or unset
          # variable instead of silently continuing to the benchmark;
          # -x keeps the original command tracing.
          set -eux
          export CUDA_VISIBLE_DEVICES=0
          echo "Installing nvidia-cutlass-dsl"
          pip install nvidia-cutlass-dsl==4.1.0.dev0
          echo "Installing Flash Attention"
          git clone https://github.com/Dao-AILab/flash-attention.git fa4
          pushd fa4
          git log -1
          python setup.py build
          python setup.py install
          # Quote expansions so paths with spaces cannot word-split.
          export PYTHONPATH="$(pwd)"
          nvidia-smi
          python benchmarks/benchmark_attn.py >> "$GITHUB_STEP_SUMMARY"
          popd
You can’t perform that action at this time.
0 commit comments