@@ -236,7 +236,7 @@ def triton_dense_dense_sparseout_matmul(
     assert dense2.stride(0) == 1, "dense2 must be contiguous along B"
 
     if K > 512:
-        # print("WARN - using naive matmul for large K")
+        print("WARN - using naive matmul for large K")
         # naive is more efficient for large K
         return (dense1 @ dense2).gather(1, at_indices)
 
@@ -378,7 +378,6 @@ def triton_sparse_dense_matmul_kernel(
     tl.store(out_ptr + pid * B + offsets_b, accum.to(sparse_values.dtype), mask=offsets_b < B)
 
 
-@torch.no_grad()
 def get_sparse_representation(x, pad_val=0):
     """
     Efficiently extracts sparse indices and values from a batched dense tensor x.
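For orientation, a minimal sketch of the padded `(indices, values)` layout that `get_sparse_representation` is documented to return. This is an illustrative assumption for readers, not the repository's implementation: each row's nonzero positions are collected into a `(B, K)` index tensor padded with `pad_val`, with the matching values alongside.

```python
import torch

def sparse_representation_sketch(x: torch.Tensor, pad_val: int = 0):
    """Illustrative only: pad each row's nonzero indices/values to a common width K."""
    nnz_per_row = x.ne(0).sum(dim=-1)
    K = int(nnz_per_row.max().item())  # widest row determines K
    indices = torch.full((x.shape[0], K), pad_val, dtype=torch.long, device=x.device)
    values = torch.zeros((x.shape[0], K), dtype=x.dtype, device=x.device)
    for row in range(x.shape[0]):  # simple loop; a real implementation would vectorize this
        idx = x[row].nonzero(as_tuple=True)[0]
        indices[row, : idx.numel()] = idx
        values[row, : idx.numel()] = x[row, idx]
    return indices, values
```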
@@ -421,33 +420,37 @@ def get_sparse_representation(x, pad_val=0):
 
 class TritonDecoderAutograd(torch.autograd.Function):
     @staticmethod
-    def forward(ctx, sparse_indices, sparse_values, decoder_weight):
-        ctx.save_for_backward(sparse_indices, sparse_values, decoder_weight)
+    def forward(ctx, feature_acts, decoder_weight, require_precise_feature_acts_grad: bool = True):
+        sparse_indices, sparse_values = get_sparse_representation(feature_acts)
+        ctx.save_for_backward(sparse_indices, sparse_values, decoder_weight, torch.tensor(require_precise_feature_acts_grad))
         return triton_sparse_dense_matmul(sparse_indices, sparse_values, decoder_weight.T)
 
     @staticmethod
     def backward(ctx, *grad_outputs, **args):
         assert len(grad_outputs) == 1, "grad_outputs must be a single tensor"
         grad_output = grad_outputs[0]
-        sparse_indices, sparse_values, decoder_weight = ctx.saved_tensors
+        sparse_indices, sparse_values, decoder_weight, require_precise_feature_acts_grad = ctx.saved_tensors
 
         assert grad_output.is_contiguous(), "grad_output must be contiguous; this is probably because the subsequent op was a .sum() or something like that, which returns a non contiguous gradient"
 
         decoder_grad = triton_sparse_transpose_dense_matmul(
-            sparse_indices, sparse_values, grad_output, N=decoder_weight.shape[1]
+            sparse_indices, sparse_values, grad_output, N=decoder_weight.size(1)
         ).T
+
+        if require_precise_feature_acts_grad.item():
+            feature_acts_grad = grad_output @ decoder_weight
+        else:
+            feature_acts_grad_sparse = triton_dense_dense_sparseout_matmul(grad_output, decoder_weight, sparse_indices)  # batch_size, K
+            B, d_sae = sparse_indices.size(0), decoder_weight.size(1)
+            feature_acts_grad = torch.zeros(size=(B, d_sae)).to(sparse_values).scatter_(dim=1, index=sparse_indices, src=feature_acts_grad_sparse)
 
-        return (
-            None,
-            triton_dense_dense_sparseout_matmul(grad_output, decoder_weight, sparse_indices),
-            # decoder is contiguous when transposed so this is a matching layout
-            decoder_grad,
-            None,
-        )
+        # decoder is contiguous when transposed so this is a matching layout
+        return feature_acts_grad, decoder_grad, None
+
 
 
 def decode_with_triton_spmm_kernel(
-    feature_acts: Float[torch.Tensor, "batch d_sae"], decoder_weight: Float[torch.Tensor, "d_model d_sae"]
+    feature_acts: Float[torch.Tensor, "batch d_sae"], decoder_weight: Float[torch.Tensor, "d_model d_sae"], require_precise_feature_acts_grad: bool
 ):
     """
     Perform sparse-dense matrix multiplication using Triton.
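The new branch in `backward` trades exactness for work: the precise path materializes the full `grad_output @ decoder_weight`, while the cheap path computes gradients only at the K stored indices and scatters them into zeros. A standalone sketch of that relationship in plain PyTorch (toy shapes, with a dense `gather` standing in for `triton_dense_dense_sparseout_matmul`):

```python
import torch

# Toy shapes for illustration; not the repository's test values.
B, K, d_sae, d_model = 2, 3, 8, 4
grad_output = torch.randn(B, d_model)
decoder_weight = torch.randn(d_model, d_sae)
indices = torch.randint(0, d_sae, (B, K))  # padded per-row active feature indices

# Precise path: dense gradient w.r.t. feature_acts.
dense_grad = grad_output @ decoder_weight  # (B, d_sae)

# Cheap path: gradient only at the active indices, scattered back into a zero tensor.
sparse_grad = (grad_output @ decoder_weight).gather(1, indices)  # stand-in for the sparse-out kernel
scattered = torch.zeros(B, d_sae).scatter_(dim=1, index=indices, src=sparse_grad)

# The two agree on the active positions; elsewhere the cheap path leaves zeros.
assert torch.allclose(dense_grad.gather(1, indices), scattered.gather(1, indices))
```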
@@ -459,13 +462,7 @@ def decode_with_triton_spmm_kernel(
     Returns:
         output: (B, d_model) - The decoded output.
     """
-    # Convert dense feature_acts into sparse representation
-    sparse_indices, sparse_values = get_sparse_representation(feature_acts)
-
-    # Perform sparse-dense multiplication using Triton
-    output = TritonDecoderAutograd.apply(sparse_indices, sparse_values, decoder_weight.T.contiguous().T)
-
-    return output
+    return TritonDecoderAutograd.apply(feature_acts, decoder_weight.T.contiguous().T, require_precise_feature_acts_grad)
 
 
 if __name__ == "__main__":
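With the rewritten wrapper, callers hand over the dense activations and pick the gradient mode directly. A minimal usage sketch, assuming `decode_with_triton_spmm_kernel` is in scope and using placeholder shapes that mirror the test below:

```python
import torch

feature_acts = torch.randn(16, 4096, device="cuda")   # (batch, d_sae)
feature_acts[torch.rand_like(feature_acts) < 0.9] = 0  # mostly zero, as SAE activations are
decoder_weight = torch.randn(256, 4096, device="cuda")  # (d_model, d_sae)

# Exact gradient w.r.t. feature_acts (dense grad_output @ decoder_weight in backward).
out_precise = decode_with_triton_spmm_kernel(feature_acts, decoder_weight, True)

# Cheaper backward: gradient only at the nonzero feature positions.
out_sparse_grad = decode_with_triton_spmm_kernel(feature_acts, decoder_weight, False)
```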
@@ -474,69 +471,62 @@ def decode_with_triton_spmm_kernel(
     import triton
     import triton.language as tl
 
-    def test_triton_decoder_forward():
-        # Set parameters
-        B, d_sae, d_model = 4, 32, 16  # Batch size, input dim, output dim
-
-        # Create a random dense weight matrix (as in nn.Linear), size = (d_model, d_sae)
-        decoder = nn.Linear(d_sae, d_model, bias=False, dtype=torch.float32, device="cuda")
-
-        # Create a random sparse input matrix
-        dense_input = torch.randn((B, d_sae), dtype=torch.float32, device="cuda")
-
-        # Zero out some values to simulate sparsity (~70% sparsity)
-        dense_input[torch.rand_like(dense_input) < 0.7] = 0
-
-        # Run our Triton-based sparse-dense multiply
-        triton_output = decode_with_triton_spmm_kernel(dense_input, decoder.weight)
-
-        # Compare against standard dense multiply (nn.Linear equivalent)
-        torch_output = decoder(dense_input)  # Equivalent to nn.Linear
-        assert isinstance(triton_output, torch.Tensor), "triton_output is not a torch.Tensor"
-        # Ensure outputs are numerically close
-        assert torch.allclose(triton_output, torch_output, atol=1e-4), "Mismatch between Triton and PyTorch outputs!"
-
-        print("✅ Triton forward pass matches nn.Linear!")
-
-    def test_triton_decoder_backward():
+    def test_triton_decoder(B, d_sae, d_model, sparsity=0.9, dtype=torch.float32, require_precise_feature_acts_grad=True):
         # Set parameters
-        B, d_sae, d_model = 4, 32, 16  # Batch size, input dim, output dim
 
         # Create a random dense weight matrix (as in nn.Linear)
-        decoder = nn.Linear(d_sae, d_model, bias=False, dtype=torch.float32, device="cuda")
+        decoder = nn.Linear(d_sae, d_model, bias=False, dtype=dtype, device="cuda")
 
         # Create a random sparse input matrix
-        dense_input = torch.randn((B, d_sae), dtype=torch.float32, device="cuda")
+        dense_input = torch.randn((B, d_sae), dtype=dtype, device="cuda")
 
-        # Zero out some values to simulate sparsity (~70% sparsity)
-        dense_input[torch.rand_like(dense_input) < 0.7] = 0
+        # Zero out some values to simulate sparsity
+        dense_input[torch.rand_like(dense_input) < sparsity] = 0
 
         # Enable gradient tracking
         decoder.weight.requires_grad_(True)
+        dense_input.requires_grad_(True)
+
+        grad_output = torch.randn((B, d_model), dtype=dtype, device="cuda")
 
         # Run forward pass with Triton
-        triton_output = decode_with_triton_spmm_kernel(dense_input, decoder.weight)
+        triton_output = decode_with_triton_spmm_kernel(dense_input, decoder.weight, require_precise_feature_acts_grad)
         assert isinstance(triton_output, torch.Tensor), "triton_output is not a torch.Tensor"
-        # Run forward pass with PyTorch nn.Linear
-        torch_output = decoder(dense_input)
-
-        # Generate random gradient to propagate backward
-        grad_output = torch.randn_like(torch_output)
-
-        # Backpropagate
+
         triton_output.backward(grad_output)
+
+        triton_decoder_weight_grad, triton_dense_input_grad = decoder.weight.grad.clone(), dense_input.grad.clone()  # pyright: ignore
+
+        decoder.weight.grad.zero_()  # pyright: ignore
+        dense_input.grad.zero_()  # pyright: ignore
+
+        torch_output = decoder(dense_input)
         torch_output.backward(grad_output)
+
+        torch_decoder_weight_grad, torch_dense_input_grad = decoder.weight.grad.clone(), dense_input.grad.clone()  # pyright: ignore
 
         # Compare gradients
         assert decoder.weight.grad is not None, "decoder.weight.grad is None"
         assert torch.allclose(
-            decoder.weight.grad, decoder.weight.grad, atol=1e-4
-        ), "Mismatch between Triton and PyTorch gradients!"
+            triton_output, torch_output, atol=1e-5
+        ), "Mismatch between Triton and PyTorch outputs!"
+        assert torch.allclose(
+            triton_decoder_weight_grad, torch_decoder_weight_grad, atol=1e-5
+        ), f"Mismatch between Triton and PyTorch gradients on decoder weights! {triton_decoder_weight_grad=}, {torch_decoder_weight_grad=}"
+
+        if require_precise_feature_acts_grad:
+            assert torch.allclose(
+                triton_dense_input_grad, torch_dense_input_grad, atol=1e-5
+            ), f"Mismatch between Triton and PyTorch gradients on dense input! {triton_dense_input_grad=}, {torch_dense_input_grad=}"
+        else:
+            assert torch.allclose(
+                triton_dense_input_grad[dense_input.ne(0)], torch_dense_input_grad[dense_input.ne(0)], atol=1e-5
+            ), f"Mismatch between Triton and PyTorch gradients on dense input! {triton_dense_input_grad=}, {torch_dense_input_grad=}"
 
-        print("✅ Triton backward pass matches nn.Linear!")
+        print("✅ Triton forward and backward pass matches nn.Linear!")
 
     # Ensure we have the Triton-based kernel
-    def benchmark_triton_vs_torch(B=32, d_sae=512, d_model=256, sparsity=0.7, warmup=5, iters=20):
+    def benchmark_triton_vs_torch(B=32, d_sae=512, d_model=256, sparsity=0.7, warmup=5, iters=20, dtype=torch.float32, require_precise_feature_acts_grad=True):
         """
         Benchmarks Triton-based sparse-dense multiplication vs PyTorch's nn.Linear.
 
@@ -549,18 +539,18 @@ def benchmark_triton_vs_torch(B=32, d_sae=512, d_model=256, sparsity=0.7, warmup
         """
 
         # Create weight matrix similar to nn.Linear
-        decoder = nn.Linear(d_sae, d_model, bias=False, dtype=torch.float32, device="cuda")
+        decoder = nn.Linear(d_sae, d_model, bias=False, dtype=dtype, device="cuda")
 
         # Generate a dense input
-        dense_input = torch.randn((B, d_sae), dtype=torch.float32, device="cuda")
+        dense_input = torch.randn((B, d_sae), dtype=dtype, device="cuda")
 
         # Introduce sparsity
         dense_input[torch.rand_like(dense_input) < sparsity] = 0
 
         # Warmup runs (to eliminate startup overhead)
         for _ in range(warmup):
             torch_output = decoder(dense_input)
-            triton_output = decode_with_triton_spmm_kernel(dense_input, decoder.weight)
+            triton_output = decode_with_triton_spmm_kernel(dense_input, decoder.weight, require_precise_feature_acts_grad)
             assert isinstance(triton_output, torch.Tensor), "triton_output is not a torch.Tensor"
             grad_output = torch.randn_like(triton_output)
             triton_output.backward(grad_output)
@@ -588,7 +578,7 @@ def benchmark_triton_vs_torch(B=32, d_sae=512, d_model=256, sparsity=0.7, warmup
 
         start_triton.record()  # type: ignore
         for _ in range(iters):
-            triton_output = decode_with_triton_spmm_kernel(dense_input, decoder.weight)
+            triton_output = decode_with_triton_spmm_kernel(dense_input, decoder.weight, require_precise_feature_acts_grad)
             assert isinstance(triton_output, torch.Tensor), "triton_output is not a torch.Tensor"
             grad_output = torch.randn_like(triton_output)
             triton_output.backward(grad_output)
@@ -603,7 +593,6 @@ def benchmark_triton_vs_torch(B=32, d_sae=512, d_model=256, sparsity=0.7, warmup
         print(f"🚀 Speedup: {torch_time / triton_time:.2f}x")
 
     # Run test
-    test_triton_decoder_forward()
-    test_triton_decoder_backward()
+    test_triton_decoder(B=16, d_sae=4096, d_model=256, sparsity=0.9, require_precise_feature_acts_grad=False)
     # Run benchmark
-    benchmark_triton_vs_torch(B=8192, d_sae=4096 * 32, d_model=4096, sparsity=0.99, warmup=10, iters=100)
+    # benchmark_triton_vs_torch(B=8192, d_sae=4096 * 32, d_model=4096, sparsity=0.99, warmup=10, iters=10, require_precise_feature_acts_grad=False)