Fix FactoredMatrix indexing returning empty result for -1 index (#1340)

Kymi808 · brendanlong · jlarson4 · web-flow · commit b3b993488647 · 2026-05-28T09:05:56.000-05:00
* Fix type of HookedTransformerConfig.device (#1230)

* Fix type of HookedTransformerConfig.device

This is typed as `Optional[str]` but sometimes returns `torch.device`. Updated the code to just return the `str` instead of wrapping with a device.

I'm not confident that every function which takes a device will always be passed a string, so I didn't change functions like warn_if_mps.

Found while working on #1219

* more cleanup

* 3.0 CI Bugs (#1261)

* Fixing `utils` imports

* skip gated notebooks on PR from forks

* Updating notebooks

* Ensure LLaMA only runs when HF_TOKEN is available

---------

Co-authored-by: jlarson4 <jonahalarson@comcast.net>

* Fix TransformerBridge backward hook cleanup (#1324)

* Fix TransformerBridge backward hook cleanup

* Preserve backward hooks in run_with_cache

* Fix FactoredMatrix indexing returning empty result for -1 index

FactoredMatrix.__getitem__ converts an integer index `v` into the matrix (ldim/rdim) dimensions to `slice(v, v + 1)`. For `v == -1` this becomes `slice(-1, 0)`, which is an empty slice, so indexing the last row/column with a negative index silently returns a (0, ...) tensor instead of the requested element. Other negative indices (-2, -3, ...) are unaffected because `v + 1` stays negative.

Use `None` as the slice stop when `v == -1` so the final element is kept.

Adds regression tests covering negative indices on each matrix dimension.

---------

Co-authored-by: Brendan Long <self@brendanlong.com>
Co-authored-by: jlarson4 <jonahalarson@comcast.net>
Co-authored-by: Samuele_Punzo <90847990+SamuelePunzo@users.noreply.github.com>
diff --git a/tests/acceptance/model_bridge/compatibility/test_backward_hooks.py b/tests/acceptance/model_bridge/compatibility/test_backward_hooks.py
@@ -51,3 +51,42 @@ def sum_bridge_grads(grad, hook=None):
             f"Gradient sums should be identical but differ by "
             f"{abs(hooked_grad_sum - bridge_grad_sum).item():.6f}"
         )
+
+
+def test_transformer_bridge_hooks_context_cleans_up_backward_hooks(
+    gpt2_hooked_unprocessed, gpt2_bridge_compat_no_processing
+):
+    """Backward hooks added via ``hooks(bwd_hooks=...)`` must be removed on context exit."""
+    hooked_model = gpt2_hooked_unprocessed
+    bridge_model = gpt2_bridge_compat_no_processing
+    hooked_hook = hooked_model.blocks[0].hook_resid_post
+    bridge_hook = bridge_model.blocks[0].hook_resid_post
+    test_input = torch.tensor([[1, 2, 3]])
+
+    def noop_backward_hook(grad, hook=None):
+        return None  # nothing to compute; only the hook's registration matters here
+
+    hooked_model.zero_grad()
+    with hooked_model.hooks(bwd_hooks=[("blocks.0.hook_resid_post", noop_backward_hook)]):
+        hooked_model(test_input).sum().backward()  # backward pass inside the hooks context
+
+    bridge_model.zero_grad()
+    with bridge_model.hooks(bwd_hooks=[("blocks.0.hook_resid_post", noop_backward_hook)]):
+        bridge_model(test_input).sum().backward()  # same scenario on the bridge model
+
+    assert not hooked_hook.has_hooks(dir="bwd", including_permanent=False)  # hooked model
+    assert not bridge_hook.has_hooks(dir="bwd", including_permanent=False)  # bridge model
+
+
+def test_transformer_bridge_reset_hooks_removes_backward_hooks(gpt2_bridge_compat_no_processing):
+    """``reset_hooks()`` on the bridge must remove backward hooks added to a hook point."""
+    bridge_model = gpt2_bridge_compat_no_processing
+    backward_hook = bridge_model.blocks[0].hook_resid_post
+
+    backward_hook.add_hook(lambda grad, hook=None: None, dir="bwd")  # register a bwd hook
+
+    assert backward_hook.has_hooks(dir="bwd", including_permanent=False)  # precondition
+
+    bridge_model.reset_hooks()
+
+    assert not backward_hook.has_hooks(dir="bwd", including_permanent=False)  # hook is gone
diff --git a/tests/acceptance/model_bridge/compatibility/test_run_with_cache.py b/tests/acceptance/model_bridge/compatibility/test_run_with_cache.py
@@ -77,3 +77,21 @@ def test_run_with_cache_accepts_1d_tensor(self, gpt2_bridge_compat_no_processing
         assert torch.allclose(
             cache_1d["blocks.0.hook_mlp_out"], cache_2d["blocks.0.hook_mlp_out"], atol=1e-5
         )
+
+
+def test_transformer_bridge_run_with_cache_preserves_existing_backward_hooks(
+    gpt2_bridge_compat_no_processing,
+):
+    """run_with_cache should not remove unrelated backward hooks on the same HookPoint."""
+    bridge_model = gpt2_bridge_compat_no_processing
+    target_hook = bridge_model.blocks[0].hook_resid_post
+
+    target_hook.add_hook(lambda grad, hook=None: None, dir="bwd")  # pre-existing bwd hook
+
+    assert target_hook.has_hooks(dir="bwd", including_permanent=False)  # precondition
+
+    bridge_model.run_with_cache(torch.tensor([[1, 2, 3]]), names_filter="blocks.0.hook_resid_post")
+
+    assert target_hook.has_hooks(dir="bwd", including_permanent=False)  # hook still attached
+
+    bridge_model.reset_hooks()  # remove the hook added above before the test returns
diff --git a/tests/unit/factored_matrix/test_get_item.py b/tests/unit/factored_matrix/test_get_item.py
@@ -51,6 +51,25 @@ def test_index_dimension_get_element(sample_factored_matrix):
     assert_close(result.AB.squeeze(), sample_factored_matrix.AB[0, 0, 0, 0, 1])
 
 
+def test_index_dimension_get_line_negative(sample_factored_matrix):
+    # Regression: index -1 on the row (ldim) dimension was converted to the empty
+    # slice(-1, 0), so the result was a (0, ...) tensor rather than the last row.
+    result = sample_factored_matrix[0, 0, 0, -1]
+    assert_close(result.AB.squeeze(), sample_factored_matrix.AB[0, 0, 0, -1])
+
+
+def test_index_dimension_get_element_negative(sample_factored_matrix):
+    # Index -1 on the column (rdim) dimension must select the last column, not an empty slice.
+    result = sample_factored_matrix[0, 0, 0, 0, -1]
+    assert_close(result.AB.squeeze(), sample_factored_matrix.AB[0, 0, 0, 0, -1])
+
+
+def test_index_dimension_get_element_both_negative(sample_factored_matrix):
+    # Index -1 on both matrix dimensions at once must select the last element of each.
+    result = sample_factored_matrix[0, 0, 0, -1, -1]
+    assert_close(result.AB.squeeze(), sample_factored_matrix.AB[0, 0, 0, -1, -1])
+
+
 def test_index_dimension_too_big(sample_factored_matrix):
     with pytest.raises(Exception):
         _ = sample_factored_matrix[1, 1, 1, 1, 1, 1]
diff --git a/transformer_lens/FactoredMatrix.py b/transformer_lens/FactoredMatrix.py
@@ -288,7 +288,12 @@ def _convert_to_slice(self, sequence: Union[Tuple, List], idx: int) -> Tuple:
         if isinstance(idx, int):
             sequence = list(sequence)
             if isinstance(sequence[idx], int):
-                sequence[idx] = slice(sequence[idx], sequence[idx] + 1)
+                value = sequence[idx]
+                # `value + 1` selects the single requested element, except when
+                # value == -1: there `value + 1 == 0` yields the empty slice(-1, 0).
+                # Use `None` as the stop so the final element is kept.
+                stop = value + 1 if value != -1 else None
+                sequence[idx] = slice(value, stop)
             sequence = tuple(sequence)
 
         return sequence
diff --git a/transformer_lens/model_bridge/bridge.py b/transformer_lens/model_bridge/bridge.py
@@ -2114,7 +2114,7 @@ def stop_hook(tensor: torch.Tensor, *, hook: Any) -> torch.Tensor:
             raise e
         finally:
             for hp, _ in hooks:
-                hp.remove_hooks()
+                hp.remove_hooks(dir="fwd")
         if self.compatibility_mode == True:
             reverse_aliases = {}
             for old_name, new_name in aliases.items():
@@ -2181,7 +2181,7 @@ def run_with_hooks(
         Returns:
             Model output
         """
-        added_hooks: List[Tuple[HookPoint, str]] = []
+        added_hooks: List[Tuple[HookPoint, Literal["fwd", "bwd"]]] = []
         effective_stop_layer = None
         if stop_at_layer is not None and hasattr(self, "blocks"):
             if stop_at_layer < 0:
@@ -2207,7 +2207,7 @@ def add_hook_to_point(
                 hook_point.add_hook(hook_fn, dir=dir, alias_names=alias_names_list)
             else:
                 hook_point.add_hook(hook_fn, dir=dir)
-            added_hooks.append((hook_point, name))
+            added_hooks.append((hook_point, dir))
 
         if stop_at_layer is not None and hasattr(self, "blocks"):
             if stop_at_layer < 0:
@@ -2276,8 +2276,8 @@ def wrapped_hook_fn(tensor, hook, _orig_fn=original_hook_fn):
             return output
         finally:
             if reset_hooks_end:
-                for hook_point, name in added_hooks:
-                    hook_point.remove_hooks()
+                for hook_point, direction in added_hooks:
+                    hook_point.remove_hooks(dir=direction)
 
     def _generate_tokens(
         self,
@@ -3452,7 +3452,7 @@ def hooks(self, fwd_hooks=[], bwd_hooks=[], reset_hooks_end=True, clear_contexts
 
         @contextmanager
         def _hooks_context():
-            added_hooks: List[Tuple[HookPoint, str]] = []
+            added_hooks: List[Tuple[HookPoint, Literal["fwd", "bwd"]]] = []
 
             def add_hook_to_point(
                 hook_point: HookPoint,
@@ -3468,7 +3468,7 @@ def add_hook_to_point(
                     hook_point.add_hook(hook_fn, dir=dir, alias_names=alias_names_list)
                 else:
                     hook_point.add_hook(hook_fn, dir=dir)
-                added_hooks.append((hook_point, name))
+                added_hooks.append((hook_point, dir))
 
             def apply_hooks(hooks: List[Tuple[Union[str, Callable], Callable]], is_fwd: bool):
                 direction: Literal["fwd", "bwd"] = "fwd" if is_fwd else "bwd"
@@ -3501,8 +3501,8 @@ def apply_hooks(hooks: List[Tuple[Union[str, Callable], Callable]], is_fwd: bool
                 yield self
             finally:
                 if reset_hooks_end:
-                    for hook_point, name in added_hooks:
-                        hook_point.remove_hooks()
+                    for hook_point, direction in added_hooks:
+                        hook_point.remove_hooks(dir=direction)
 
         return _hooks_context()
 
diff --git a/transformer_lens/model_bridge/generalized_components/base.py b/transformer_lens/model_bridge/generalized_components/base.py
@@ -188,12 +188,12 @@ def remove_hooks(self, hook_name: str | None = None) -> None:
             hook_name: Name of the hook point to remove. If None, removes all hooks.
         """
         if hook_name is None:
-            self.hook_in.remove_hooks()
-            self.hook_out.remove_hooks()
+            self.hook_in.remove_hooks(dir="both")
+            self.hook_out.remove_hooks(dir="both")
         elif hook_name == "output":
-            self.hook_out.remove_hooks()
+            self.hook_out.remove_hooks(dir="both")
         elif hook_name == "input":
-            self.hook_in.remove_hooks()
+            self.hook_in.remove_hooks(dir="both")
         else:
             raise ValueError(
                 f"Hook name '{hook_name}' not supported. Supported names are 'output' and 'input'."