-from typing import Literal, Union, overload
+from typing import Callable, Literal, Union, cast, overload

import torch
from jaxtyping import Float
@@ -32,6 +32,56 @@ def _decoder_norm(self, decoder: torch.nn.Linear, keepdim: bool = False, local_o
        )
        return decoder_norm

+    def activation_function_factory(self) -> Callable[[torch.Tensor], torch.Tensor]:
+        assert self.cfg.act_fn.lower() in [
+            "relu",
+            "topk",
+            "jumprelu",
+            "batchtopk",
+        ], f"Not implemented activation function {self.cfg.act_fn}"
+        if self.cfg.act_fn.lower() == "jumprelu":
+
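+            # Straight-through estimator (STE) for the JumpReLU gate: the forward pass
+            # emits a hard 0/1 mask over the pre-activations, while the backward pass
+            # routes gradient only to the learned log-threshold, never to the input.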
+            class STEFunction(torch.autograd.Function):
+                @staticmethod
+                def forward(ctx, input: torch.Tensor, log_jumprelu_threshold: torch.Tensor):
+                    jumprelu_threshold = log_jumprelu_threshold.exp()
+                    jumprelu_threshold = all_reduce_tensor(jumprelu_threshold, aggregate="sum")
+                    ctx.save_for_backward(input, jumprelu_threshold)
+                    return input.gt(jumprelu_threshold).to(input.dtype)
+
+                @staticmethod
+                def backward(ctx, *grad_outputs: torch.Tensor):
+                    assert len(grad_outputs) == 1
+                    grad_output = grad_outputs[0]
+
+                    input, jumprelu_threshold = ctx.saved_tensors
+                    grad_input = torch.zeros_like(input)
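+                    # The hard gate has zero gradient almost everywhere, so no gradient is
+                    # passed to the input (grad_input stays zero). The log-threshold instead
+                    # receives a rectangular surrogate gradient that is nonzero only where the
+                    # input falls within jumprelu_threshold_window of the threshold.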
+                    grad_log_jumprelu_threshold_unscaled = torch.where(
+                        (input - jumprelu_threshold).abs() < self.cfg.jumprelu_threshold_window * 0.5,
+                        -jumprelu_threshold / self.cfg.jumprelu_threshold_window,
+                        0.0,
+                    )
+                    grad_log_jumprelu_threshold = (
+                        grad_log_jumprelu_threshold_unscaled
+                        / torch.where(
+                            ((input - jumprelu_threshold).abs() < self.cfg.jumprelu_threshold_window * 0.5)
+                            * (input != 0.0),
+                            input,
+                            1.0,
+                        )
+                        * grad_output
+                    )
+                    grad_log_jumprelu_threshold = grad_log_jumprelu_threshold.sum(
+                        dim=tuple(range(grad_log_jumprelu_threshold.ndim - 1))
+                    )
+
+                    return grad_input, grad_log_jumprelu_threshold
+
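+            # Bind the module's log_jumprelu_threshold to the autograd Function; cast() is a
+            # typing no-op that only narrows the return type of Function.apply.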
+            return lambda x: cast(torch.Tensor, STEFunction.apply(x, self.log_jumprelu_threshold))
+
+        else:
+            return super().activation_function_factory()
+
    @overload
    def encode(
        self,
@@ -109,14 +159,14 @@ def encode(
        hidden_pre = self.hook_hidden_pre(hidden_pre)

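+        # sparsity_scores: the pre-activations, optionally scaled by each feature's decoder
+        # column norm so that the gating decision reflects the feature's contribution to the output.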
        if self.cfg.sparsity_include_decoder_norm:
-            true_feature_acts = hidden_pre * self._decoder_norm(
+            sparsity_scores = hidden_pre * self._decoder_norm(
                decoder=self.decoder,
                local_only=True,
            )
        else:
-            true_feature_acts = hidden_pre
+            sparsity_scores = hidden_pre

-        activation_mask = self.activation_function(true_feature_acts)
+        activation_mask = self.activation_function(sparsity_scores)
        feature_acts = hidden_pre * activation_mask

        feature_acts = self.hook_feature_acts(feature_acts)
@@ -131,7 +181,9 @@ def compute_loss(
        batch: dict[str, torch.Tensor],
        *,
        use_batch_norm_mse: bool = False,
-        lp: int = 1,
+        sparsity_loss_type: Literal["power", "tanh", None] = None,
+        tanh_stretch_coefficient: float = 4.0,
+        p: int = 1,
        return_aux_data: Literal[True] = True,
        **kwargs,
    ) -> tuple[
@@ -145,7 +197,9 @@ def compute_loss(
        batch: dict[str, torch.Tensor],
        *,
        use_batch_norm_mse: bool = False,
-        lp: int = 1,
+        sparsity_loss_type: Literal["power", "tanh", None] = None,
+        tanh_stretch_coefficient: float = 4.0,
+        p: int = 1,
        return_aux_data: Literal[False],
        **kwargs,
    ) -> Float[torch.Tensor, " batch"]: ...
@@ -162,7 +216,9 @@ def compute_loss(
        ) = None,
        *,
        use_batch_norm_mse: bool = False,
-        lp: int = 1,
+        sparsity_loss_type: Literal["power", "tanh", None] = None,
+        tanh_stretch_coefficient: float = 4.0,
+        p: int = 1,
        return_aux_data: bool = True,
        **kwargs,
    ) -> Union[
@@ -194,25 +250,31 @@ def compute_loss(
            .sqrt()
        )

-        l_rec = l_rec.mean()
-        l_rec = all_reduce_tensor(l_rec, aggregate="mean")
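+        # Sum l_rec over its trailing dimension and average over the batch; the
+        # distributed mean-reduction is now applied once to the total loss below.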
+        l_rec = l_rec.sum(dim=-1).mean()

        loss = l_rec
        loss_dict = {
            "l_rec": l_rec,
        }

-        # l_l1: (batch,)
-        feature_acts = feature_acts * self._decoder_norm(
-            decoder=self.decoder,
-            local_only=True,
-        )
-
-        if "topk" not in self.cfg.act_fn:
-            l_lp = torch.norm(feature_acts, p=lp, dim=-1)
-            loss_dict["l_lp"] = l_lp
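+        # Sparsity penalty on decoder-norm-weighted activations: "power" takes their L^p norm
+        # (p=1 gives an L1-style penalty), while "tanh" applies a saturating soft count.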
+        if sparsity_loss_type == "power":
+            l_s = torch.norm(feature_acts * self._decoder_norm(decoder=self.decoder), p=p, dim=-1)
+            loss_dict["l_s"] = self.current_l1_coefficient * l_s.mean()
            assert self.current_l1_coefficient is not None
-            loss = loss + self.current_l1_coefficient * l_lp.mean()
+            loss = loss + self.current_l1_coefficient * l_s.mean()
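+        # tanh saturates for large activations, so this term roughly counts active features;
+        # tanh_stretch_coefficient sets how quickly the penalty saturates.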
+        elif sparsity_loss_type == "tanh":
+            l_s = torch.tanh(tanh_stretch_coefficient * feature_acts * self._decoder_norm(decoder=self.decoder)).sum(
+                dim=-1
+            )
+            loss_dict["l_s"] = self.current_l1_coefficient * l_s.mean()
+            assert self.current_l1_coefficient is not None
+            loss = loss + self.current_l1_coefficient * l_s.mean()
+        elif sparsity_loss_type is None:
+            pass
+        else:
+            raise ValueError(f"sparsity_loss_type {sparsity_loss_type} not supported.")
+
+        loss = all_reduce_tensor(loss, aggregate="mean")

        if return_aux_data:
            aux_data = {
@@ -229,7 +291,8 @@ def compute_loss(

    @torch.no_grad()
    def log_statistics(self):
-        return {}
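+        # Surface the stored dataset-average activation norms under the "info/" prefix.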
+        assert self.dataset_average_activation_norm is not None
+        return {f"info/{k}": v for k, v in self.dataset_average_activation_norm.items()}

    def initialize_with_same_weight_across_layers(self):
        self.encoder.weight.data = get_tensor_from_specific_rank(self.encoder.weight.data.clone(), src=0)