Backward compatibility for BatchTopK SAE, support for BatchTopK SAE finetuning, fix license config

Butanium · Butanium · commit 48debb2707c7 · 2025-06-27T15:08:49.000Z
diff --git a/dictionary_learning/dictionary.py b/dictionary_learning/dictionary.py
@@ -667,6 +667,29 @@ def from_pretrained(
 
         state_dict = th.load(path, weights_only=True)
         dict_size, activation_dim = state_dict["encoder.weight"].shape
+        normalization_keys = [
+            "target_rms",
+            "activation_mean",
+            "activation_std",
+            "activation_global_scale",
+        ]
+        is_in_dict = th.tensor([k in state_dict for k in normalization_keys])
+        if not is_in_dict.all():
+            if is_in_dict.any():
+                raise ValueError(
+                    f"Some normalization keys are present in the state dict but not all. Missing keys: {[n for n in normalization_keys if n not in state_dict]}"
+                )
+            else:
+                warn(
+                    "No normalization keys found in the state dict. Assuming no normalization is needed. This is normal for old dictionaries."
+                )
+                for key in normalization_keys:
+                    state_dict[key] = (
+                        th.full((activation_dim,), th.nan)
+                        if key in ["activation_mean", "activation_std"]
+                        else th.tensor(th.nan)
+                    )
+
         if k is None:
             k = state_dict["k"].item()
         elif "k" in state_dict and k != state_dict["k"].item():
diff --git a/dictionary_learning/trainers/batch_top_k.py b/dictionary_learning/trainers/batch_top_k.py
@@ -21,6 +21,7 @@ def __init__(
         layer: int,
         lm_name: str,
         dict_class: type = BatchTopKSAE,
+        pretrained_ae: Optional[BatchTopKSAE] = None,
         lr: Optional[float] = None,
         auxk_alpha: float = 1 / 32,
         warmup_steps: int = 1000,
@@ -33,7 +34,7 @@ def __init__(
         activation_mean: Optional[t.Tensor] = None,
         activation_std: Optional[t.Tensor] = None,
         target_rms: float = 1.0,
-        encoder_init_norm: str = 1.0,
+        encoder_init_norm: float = 1.0,
     ):
         super().__init__(seed)
         assert layer is not None and lm_name is not None
@@ -51,15 +52,18 @@ def __init__(
             t.manual_seed(seed)
             t.cuda.manual_seed_all(seed)
 
-        self.ae = dict_class(
-            activation_dim,
-            dict_size,
-            k,
-            activation_mean=activation_mean,
-            activation_std=activation_std,
-            target_rms=target_rms,
-            encoder_init_norm=encoder_init_norm,
-        )
+        if pretrained_ae is None:
+            self.ae = dict_class(
+                activation_dim,
+                dict_size,
+                k,
+                activation_mean=activation_mean,
+                activation_std=activation_std,
+                target_rms=target_rms,
+                encoder_init_norm=encoder_init_norm,
+            )
+        else:
+            self.ae = pretrained_ae
 
         if device is None:
             self.device = "cuda" if t.cuda.is_available() else "cpu"
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,14 +3,17 @@ requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"]
 build-backend = "setuptools.build_meta"
 [tool.setuptools_scm]
 
+[tool.setuptools.packages.find]
+include = ["dictionary_learning*"]
+exclude = ["junk*"]
+
 [project]
 dynamic = ["version"]
 name = "dictionary_learning"
 description = "A package for dictionary learning via sparse autoencoders on neural network activations"
 readme = "README.md"
 keywords = ["dictionary learning", "sparse autoencoder", "neural networks"]
-
-license = { text = "MIT" }
+license-files = ["LICENSE"]
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",