From e8f19009bbc3c3ba9bea2d97ade10f83682593e2 Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Mon, 9 Jun 2025 17:59:02 -0400 Subject: [PATCH 01/14] enable CI with TRITON_INTERPRET=1; format trace.py using pre-commit --- .github/workflows/python-app.yml | 12 ++++++++---- triton_viz/core/trace.py | 10 ++++++++-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 3bc0f3e1..6adabd6a 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -5,11 +5,13 @@ name: Python application on: push: - branches-ignore: - - '**' + branches: + - main + - keren/v2.0 pull_request: - branches-ignore: - - '**' + branches: + - main + - keren/v2.0 permissions: contents: read @@ -54,6 +56,8 @@ jobs: pip install -e . - name: Test with pytest + env: | + TRITON_INTERPRET=1 run: | cd triton_viz python -m pytest tests diff --git a/triton_viz/core/trace.py b/triton_viz/core/trace.py index 44ff8d5e..ef4f2ed1 100644 --- a/triton_viz/core/trace.py +++ b/triton_viz/core/trace.py @@ -34,7 +34,11 @@ def add_client(self, new_client: Union[Client, str]) -> None: new_client_instance = self._normalize_client(new_client) self.client_manager.add_clients([new_client_instance]) - def __init__(self, kernel: Union[JITFunction, InterpretedFunction], client: Union[str, Client]) -> None: + def __init__( + self, + kernel: Union[JITFunction, InterpretedFunction], + client: Union[str, Client], + ) -> None: self.fn = kernel if isinstance(kernel, InterpretedFunction): self.interpreter_fn = kernel @@ -91,7 +95,9 @@ def decorator(kernel) -> Trace: trace.add_client(clients) return trace - raise TypeError(f"Expected JITFunction, InterpretedFunction or Trace, got {type(kernel)}") + raise TypeError( + f"Expected JITFunction, InterpretedFunction or Trace, got {type(kernel)}" + ) return decorator From 4518dbce4a8ce7714a86491b104386cfebbde97b Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Mon, 9 Jun 2025 18:03:23 -0400 Subject: [PATCH 02/14] fix yaml format --- .github/workflows/python-app.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 6adabd6a..0edc70d0 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -57,7 +57,7 @@ jobs: - name: Test with pytest env: | - TRITON_INTERPRET=1 + TRITON_INTERPRET: "1" run: | cd triton_viz python -m pytest tests From 515da8dcf839ba3235dece2f52bf903d46e0b3a9 Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Mon, 9 Jun 2025 18:06:29 -0400 Subject: [PATCH 03/14] fix yaml 2.0 --- .github/workflows/python-app.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 0edc70d0..75c90342 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -23,6 +23,8 @@ concurrency: jobs: build: runs-on: ubuntu-latest + env: | + TRITON_INTERPRET: "1" steps: - uses: actions/checkout@v3 @@ -56,8 +58,6 @@ jobs: pip install -e . 
    - name: Test with pytest
-      env: |
-        TRITON_INTERPRET: "1"
       run: |
         cd triton_viz
         python -m pytest tests

From 8c560deeeebc70a53a050365c58f32bd5a2fdd17 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Mon, 9 Jun 2025 18:07:43 -0400
Subject: [PATCH 04/14] remove extra '|'

---
 .github/workflows/python-app.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 75c90342..717dce85 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -23,7 +23,7 @@ concurrency:
 jobs:
   build:
     runs-on: ubuntu-latest
-    env: |
+    env:
       TRITON_INTERPRET: "1"

     steps:

From 76d3188d9ad4322e8a2e119301499d785f8d3209 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Mon, 9 Jun 2025 19:30:17 -0400
Subject: [PATCH 05/14] remove GPU dependencies in unittests

---
 tests/test_config.py          | 3 ++-
 tests/test_print_traceback.py | 2 +-
 tests/test_wrapper.py         | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_config.py b/tests/test_config.py
index c8926448..8ce9d9c9 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1,4 +1,5 @@
-import pytest
+import pytest, os
+os.environ["TRITON_SANITIZER_BACKEND"] = "off"

 import triton_viz.core.config as cfg

diff --git a/tests/test_print_traceback.py b/tests/test_print_traceback.py
index 226d54e3..e09f999b 100644
--- a/tests/test_print_traceback.py
+++ b/tests/test_print_traceback.py
@@ -27,7 +27,7 @@ def kernel_A(ptr, n):

 def test_print_nested_functions():
-    x = torch.arange(4, device="cuda", dtype=torch.float32)
+    x = torch.arange(4, dtype=torch.float32)
     print("Input:", x)

     # We'll launch a grid bigger than x.numel() to force an out-of-bounds error

diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py
index 925c521f..54028c4a 100644
--- a/tests/test_wrapper.py
+++ b/tests/test_wrapper.py
@@ -59,6 +59,7 @@ def _decorator(fn):
     env = os.environ.copy()
     env["PYTHONPATH"] = str(tmp_path) + os.pathsep + env.get("PYTHONPATH", "")
     env["TRITON_SANITIZER_BACKEND"] = "symexec"
+    env["TRITON_INTERPRET"] = "1"

     # run the dummy program using triton-sanitizer
     cmd = ["triton-sanitizer", str(tmp_path / "dummy_program.py")]

From 3ddc8bc1703539b9269cfdeace9cb194305c75ba Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Mon, 9 Jun 2025 19:36:52 -0400
Subject: [PATCH 06/14] remove GPU dependencies in test_autotune_add.py

---
 tests/test_autotune_add.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/test_autotune_add.py b/tests/test_autotune_add.py
index e0ece54e..480fd43a 100644
--- a/tests/test_autotune_add.py
+++ b/tests/test_autotune_add.py
@@ -9,6 +9,17 @@

 cfg.sanitizer_backend = "symexec"

+# skip benchmarking in CPU-only environments,
+# as do_bench relies on GPUs
+if not torch.backends.cuda.is_built():
+    import triton.testing
+    triton.testing.do_bench = lambda *_, **__: [0.0, 0.0, 0.0]
+    from triton.runtime import autotuner
+    orig_init = autotuner.Autotuner.__init__
+    def patched_init(self, *args, **kwargs):
+        orig_init(self, *args, **kwargs)
+        self.do_bench = lambda *_, **__: [0.0, 0.0, 0.0]
+    autotuner.Autotuner.__init__ = patched_init

 @triton.autotune(
     configs=[

From 6af5d721f7ef9f2af5fe1c89a3bf192b1138b7d1 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Mon, 9 Jun 2025 19:38:40 -0400
Subject: [PATCH 07/14] remove cuda tensors

---
 tests/test_autotune_add.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_autotune_add.py b/tests/test_autotune_add.py
index 480fd43a..6ae010b8 100644
--- a/tests/test_autotune_add.py
+++ b/tests/test_autotune_add.py
@@ -50,8 +50,8 @@ def test_autotune_add_inrange():
     This test uses n_elements = 128, matching the size of the input tensors.
     It should NOT cause any out-of-bound access.
     """
-    x = torch.randn(128, device="cuda")
-    y = torch.randn(128, device="cuda")
+    x = torch.randn(128)
+    y = torch.randn(128)
     out = torch.empty_like(x)

     # The kernel launch uses n_elements=128, aligned with the tensor size.
@@ -66,8 +66,8 @@ def test_autotune_add_out_of_bound():
     This test deliberately sets n_elements = 256, exceeding the actual buffer size (128).
     It will likely cause out-of-bound reads/writes, which may trigger errors or warnings.
     """
-    x = torch.randn(128, device="cuda")
-    y = torch.randn(128, device="cuda")
+    x = torch.randn(128)
+    y = torch.randn(128)
     out = torch.empty_like(x)

     # The kernel launch uses n_elements=256, exceeding the valid tensor size.

From 6dbc7eb27c4eb7e7fefd7adc3b2a693b0c7d26f6 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Mon, 9 Jun 2025 20:12:30 -0400
Subject: [PATCH 08/14] use newest triton

---
 .github/workflows/python-app.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 717dce85..c06084e5 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -50,7 +50,9 @@ jobs:

     - name: Clone Triton and Install
       run: |
-        pip install triton==3.1.0
+        git clone https://github.com/openai/triton.git
+        pip install -r python/requirements.txt
+        pip install -e .

     - name: Install Triton-Viz
       run: |

From 636066b32c12cb4b94009505c3f63e71641011bc Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Mon, 9 Jun 2025 20:15:08 -0400
Subject: [PATCH 09/14] fix triton installation

---
 .github/workflows/python-app.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index c06084e5..e3cb9f9a 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -51,6 +51,7 @@ jobs:
     - name: Clone Triton and Install
       run: |
         git clone https://github.com/openai/triton.git
+        cd triton
         pip install -r python/requirements.txt
         pip install -e .

From 1e3bd5628ff6eff0a43a2b364b0bffa9ebe06b92 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Wed, 11 Jun 2025 13:15:49 -0400
Subject: [PATCH 10/14] skip autotuner test on CPU for now

---
 .github/workflows/python-app.yml | 7 ++-----
 tests/test_autotune_add.py       | 9 +++++++++
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index e3cb9f9a..55a18749 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -48,12 +48,9 @@ jobs:
         pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
         pip uninstall pytorch-triton -y

-    - name: Clone Triton and Install
+    - name: Install Triton
       run: |
-        git clone https://github.com/openai/triton.git
-        cd triton
-        pip install -r python/requirements.txt
-        pip install -e .
+ pip install triton==3.1.0 - name: Install Triton-Viz run: | diff --git a/tests/test_autotune_add.py b/tests/test_autotune_add.py index 6ae010b8..c6c5411a 100644 --- a/tests/test_autotune_add.py +++ b/tests/test_autotune_add.py @@ -1,3 +1,4 @@ +import pytest import torch import triton import triton.language as tl @@ -45,6 +46,10 @@ def add_kernel_no_mask(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constex tl.store(out_ptr + offsets, x_val + y_val) +@pytest.mark.skipif( + not torch.backends.cuda.is_built(), + reason="This test requires a CUDA-enabled environment.", +) def test_autotune_add_inrange(): """ This test uses n_elements = 128, matching the size of the input tensors. @@ -61,6 +66,10 @@ def test_autotune_add_inrange(): print("test_autotune_add_inrange() passed: No out-of-bound access.") +@pytest.mark.skipif( + not torch.backends.cuda.is_built(), + reason="This test requires a CUDA-enabled environment.", +) def test_autotune_add_out_of_bound(): """ This test deliberately sets n_elements = 256, exceeding the actual buffer size (128). From 6cfac320abe4fc89c9754f5fffb8a2251f8803f8 Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Wed, 11 Jun 2025 13:45:29 -0400 Subject: [PATCH 11/14] delete patched_init for now --- tests/test_autotune_add.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/test_autotune_add.py b/tests/test_autotune_add.py index c6c5411a..16a24027 100644 --- a/tests/test_autotune_add.py +++ b/tests/test_autotune_add.py @@ -10,18 +10,6 @@ cfg.sanitizer_backend = "symexec" -# skip benchmarking in CPU-only environments, -# as do_bench relies on GPUs -if not torch.backends.cuda.is_built(): - import triton.testing - triton.testing.do_bench = lambda *_, **__: [0.0, 0.0, 0.0] - from triton.runtime import autotuner - orig_init = autotuner.Autotuner.__init__ - def patched_init(self, *args, **kwargs): - orig_init(self, *args, **kwargs) - self.do_bench = lambda *_, **__: [0.0, 0.0, 0.0] - autotuner.Autotuner.__init__ = patched_init - @triton.autotune( configs=[ triton.Config({"BLOCK_SIZE": 32}, num_warps=1), From 7e4253d88d2dce601ab38ea6f8c8bd58362201ac Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Wed, 11 Jun 2025 14:10:47 -0400 Subject: [PATCH 12/14] skip autotune when gpu is missing --- tests/test_autotune_add.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/test_autotune_add.py b/tests/test_autotune_add.py index 16a24027..ccde6a25 100644 --- a/tests/test_autotune_add.py +++ b/tests/test_autotune_add.py @@ -8,6 +8,9 @@ from triton_viz import config as cfg +if not torch.backends.cuda.is_built(): + pytest.skip("This test requires a CUDA-enabled environment.") + cfg.sanitizer_backend = "symexec" @triton.autotune( @@ -34,10 +37,6 @@ def add_kernel_no_mask(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constex tl.store(out_ptr + offsets, x_val + y_val) -@pytest.mark.skipif( - not torch.backends.cuda.is_built(), - reason="This test requires a CUDA-enabled environment.", -) def test_autotune_add_inrange(): """ This test uses n_elements = 128, matching the size of the input tensors. @@ -54,10 +53,6 @@ def test_autotune_add_inrange(): print("test_autotune_add_inrange() passed: No out-of-bound access.") -@pytest.mark.skipif( - not torch.backends.cuda.is_built(), - reason="This test requires a CUDA-enabled environment.", -) def test_autotune_add_out_of_bound(): """ This test deliberately sets n_elements = 256, exceeding the actual buffer size (128). 
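A note on the skip pattern above: calling `pytest.skip(...)` at module scope without `allow_module_level=True` makes pytest abort collection with a usage error, which is what PATCH 13 below addresses. A minimal sketch of an equivalent, marker-based way to gate the whole module, assuming only `pytest` and `torch`; `pytestmark` is pytest's documented module-level marker hook, not a name taken from this repo:

```python
import pytest
import torch

# Apply skipif to every test collected from this module; unlike a bare
# module-level pytest.skip(), this never raises at import time.
pytestmark = pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="This test requires a CUDA-enabled environment.",
)
```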
From ee21925b047c0fb6a3d0596ad20acf30932513f6 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Wed, 11 Jun 2025 14:12:56 -0400
Subject: [PATCH 13/14] allow pytest skipping at module level

---
 tests/test_autotune_add.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_autotune_add.py b/tests/test_autotune_add.py
index ccde6a25..87997fb7 100644
--- a/tests/test_autotune_add.py
+++ b/tests/test_autotune_add.py
@@ -9,7 +9,7 @@

 if not torch.backends.cuda.is_built():
-    pytest.skip("This test requires a CUDA-enabled environment.")
+    pytest.skip("This test requires a CUDA-enabled environment.", allow_module_level=True)

 cfg.sanitizer_backend = "symexec"

From 5d04beb2fa82aaa1a3b631ef5f3ff99fa4daf7d5 Mon Sep 17 00:00:00 2001
From: Hao Wu
Date: Wed, 11 Jun 2025 14:25:51 -0400
Subject: [PATCH 14/14] use try/except to skip autotuner test

---
 tests/test_autotune_add.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_autotune_add.py b/tests/test_autotune_add.py
index 87997fb7..5c1cf79c 100644
--- a/tests/test_autotune_add.py
+++ b/tests/test_autotune_add.py
@@ -8,7 +8,9 @@

 from triton_viz import config as cfg

-if not torch.backends.cuda.is_built():
+try:
+    torch.cuda.current_device()
+except Exception:
     pytest.skip("This test requires a CUDA-enabled environment.", allow_module_level=True)

 cfg.sanitizer_backend = "symexec"
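Why the series lands on a try/except probe: `torch.backends.cuda.is_built()` only reports whether the installed torch wheel was compiled with CUDA support, not whether a driver and device are actually reachable, so a CUDA-built wheel on a GPU-less CI runner still passes the `is_built()` check. A minimal sketch combining both checks, assuming only `pytest` and `torch`; the `_cuda_usable` helper is a hypothetical name, not something defined in this series:

```python
import pytest
import torch

def _cuda_usable() -> bool:
    """True only if torch was built with CUDA and a device is reachable."""
    if not torch.backends.cuda.is_built():
        return False  # CPU-only wheel
    try:
        torch.cuda.current_device()  # raises when no driver/GPU is visible
        return True
    except Exception:
        return False

if not _cuda_usable():
    pytest.skip("This test requires a CUDA-enabled environment.", allow_module_level=True)
```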