GHA that adds Flash Attention Benchmarking on B200 #49


Merged 1 commit on Jul 18, 2025
44 changes: 44 additions & 0 deletions .github/workflows/flash_attention.yml
@@ -0,0 +1,44 @@
name: Flash Attention Benchmark

# To remotely trigger a FA Benchmarking run, use the following:
# curl -L -X POST -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" -H "Authorization: Bearer $TOKEN" https://api.github.com/repos/pytorch/pytorch-integration-testing/dispatches -d '{"event_type": "benchmark_flash_attention"}'

on:
  schedule:
    - cron: "0 6 * * *" # Run every day at 6 AM UTC
  push:
    paths:
      - .github/workflows/flash_attention.yml
  repository_dispatch:
huydhn (Contributor) commented on Jul 17, 2025:
Oh, this is new to me. What does this repository_dispatch do? :)

Contributor Author replied:

This allows triggering the action by running:
$ curl -L -X POST -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" -H "Authorization: Bearer $TOKEN" https://api.github.com/repos/pytorch/pytorch-integration-testing/dispatches -d '{"event_type": "benchmark_flash_attention"}'

The intent is to trigger this action when there is a commit in the FA repo, but the FA repo needs to be updated first.
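For reference, the same trigger can be sketched in Python using only the standard library and GitHub's repository-dispatch REST endpoint. The `GH_TOKEN` environment-variable name and the helper function below are illustrative, not part of this workflow:

```python
# Sketch: fire the repository_dispatch event from Python instead of curl.
# Assumes a token with repo scope is available in GH_TOKEN (illustrative name).
import json
import os
import urllib.request


def build_dispatch_request(owner: str, repo: str, event_type: str, token: str):
    """Build the POST request for GitHub's repository_dispatch endpoint."""
    url = f"https://api.github.com/repos/{owner}/{repo}/dispatches"
    payload = json.dumps({"event_type": event_type}).encode()
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }
    return urllib.request.Request(url, data=payload, headers=headers, method="POST")


# Usage (sends the event; GitHub answers 204 No Content on success):
#   req = build_dispatch_request("pytorch", "pytorch-integration-testing",
#                                "benchmark_flash_attention", os.environ["GH_TOKEN"])
#   urllib.request.urlopen(req)
```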

    types: benchmark_flash_attention
  workflow_dispatch:
jobs:
  benchmark-flash-attn:
    name: Flash Attention CuTe DSL Benchmark
    runs-on: B200
    container:
      # https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/
      image: nvcr.io/nvidia/pytorch:25.06-py3
      options: --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864
    steps:
      - uses: actions/checkout@v4
        with:
          repository: 'Dao-AILab/flash-attention'
          path: 'fa4'
      - name: Install CuTe DSL
        run: |
          set -x
          echo "Installing nvidia-cutlass-dsl"
          pip install nvidia-cutlass-dsl==4.1.0.dev0
      - name: Build and Run FlashAttention CuTe DSL
        run: |
          set -x
          pushd fa4
          python setup.py install

          echo '<h1>B200 1000W</h1>' >> $GITHUB_STEP_SUMMARY
          nvidia-smi
          export PYTHONPATH=$(pwd)
          python benchmarks/benchmark_attn.py >> $GITHUB_STEP_SUMMARY
Contributor commented:

If you are interested in building a dashboard out of it on https://hud.pytorch.org later, feel free to take a look at https://github.com/pytorch/pytorch/wiki/PyTorch-OSS-benchmark-infra#benchmark-results-format to see if we could save the attention benchmark output in the same format. That's all that needs to be done.


          popd
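The results-format suggestion above could be sketched as a small conversion step. This is a hedged reading of the field layout described on the PyTorch OSS benchmark infra wiki page; the exact field names and the example model/metric names below are assumptions that should be checked against that page before wiring up a dashboard:

```python
# Sketch: wrap one attention benchmark measurement in the JSON record shape
# described on the PyTorch OSS benchmark infra wiki (field names assumed from
# that page, not verified here), so output could feed a hud.pytorch.org dashboard.
import json


def to_benchmark_record(model_name: str, metric_name: str, values: list) -> dict:
    """Build one benchmark-result record for a given model/metric pair."""
    return {
        "benchmark": {
            "name": "FlashAttention CuTe DSL",
            "extra_info": {"device": "B200"},  # assumed free-form metadata slot
        },
        "model": {"name": model_name},
        "metric": {"name": metric_name, "benchmark_values": values},
    }


# Example with illustrative numbers (not real measurements):
record = to_benchmark_record("fa4_fwd_bf16", "tflops", [1234.5])
print(json.dumps([record], indent=2))
```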