Commit e09b5f3
Release: 2026-01-14 (#76)
Squashed changes from error-vector-fix branch.
1 parent 9317b2a

9 files changed (+140, -49 lines)

circuit_tracer/replacement_model/replacement_model_nnsight.py

Lines changed: 2 additions & 2 deletions
@@ -303,7 +303,7 @@ def fetch_activations(
         gemma_3_it = "gemma-3" in self.cfg.model_name and self.cfg.model_name.endswith("-it")
         overlap = 0
         if gemma_3_it:
-            input_ids = self.input.squeeze(0)
+            input_ids = self.input
             ignore_prefix = torch.tensor(
                 [2, 105, 2364, 107], dtype=input_ids.dtype, device=input_ids.device
             )
@@ -541,7 +541,7 @@ def setup_attribution(self, inputs: str | torch.Tensor):
         # Compute error vectors
         error_vectors = mlp_out_cache - attribution_data["reconstruction"]
 
-        error_vectors[:, 0] = 0
+        error_vectors[:, zero_positions] = 0
         token_vectors = self.embed_weight[  # type: ignore
             tokens
         ].detach()  # (n_pos, d_model)  # type: ignore
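
A minimal sketch of the error-vector change above, with assumed toy shapes and random stand-ins for the cached tensors (mlp_out_cache, attribution_data["reconstruction"]): the mask widens from the single BOS position to an arbitrary zero_positions slice, which the gemma-3 -it path needs for its multi-token chat-template prefix.

import torch

# Assumed toy shapes; the real tensors come from the model's forward cache.
n_layers, n_pos, d_model = 2, 8, 16
mlp_out_cache = torch.randn(n_layers, n_pos, d_model)   # true MLP outputs
reconstruction = torch.randn(n_layers, n_pos, d_model)  # transcoder reconstruction

error_vectors = mlp_out_cache - reconstruction

# Old behaviour: zero only the BOS position.
# error_vectors[:, 0] = 0
# New behaviour: zero any prefix of positions, e.g. a 4-token chat prefix.
zero_positions = slice(0, 4)
error_vectors[:, zero_positions] = 0
assert error_vectors[:, :4].abs().sum() == 0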

circuit_tracer/transcoder/cross_layer_transcoder.py

Lines changed: 13 additions & 5 deletions
@@ -281,7 +281,9 @@ def select_decoder_vectors(self, features):
 
         return pos_ids, layer_ids, feat_ids, decoder_vectors, encoder_mapping
 
-    def compute_reconstruction(self, pos_ids, layer_ids, decoder_vectors):
+    def compute_reconstruction(
+        self, pos_ids, layer_ids, decoder_vectors, input_acts: torch.Tensor | None = None
+    ):
         n_pos = pos_ids.max() + 1
         flat_idx = layer_ids * n_pos + pos_ids
         recon = torch.zeros(
@@ -290,11 +292,17 @@ def compute_reconstruction(self, pos_ids, layer_ids, decoder_vectors):
             device=decoder_vectors.device,
             dtype=decoder_vectors.dtype,
         ).index_add_(0, flat_idx, decoder_vectors)
-        return recon.reshape(self.n_layers, n_pos, self.d_model) + self.b_dec[:, None]
+        recon = recon.reshape(self.n_layers, n_pos, self.d_model) + self.b_dec[:, None]
+        if self.W_skip is not None:
+            assert input_acts is not None, (
+                "Transcoder has skip connection but no input_acts were provided"
+            )
+            recon = recon + input_acts @ self.W_skip
+        return recon
 
-    def decode(self, features):
+    def decode(self, features, input_acts: torch.Tensor | None = None):
         pos_ids, layer_ids, feat_ids, decoder_vectors, _ = self.select_decoder_vectors(features)
-        return self.compute_reconstruction(pos_ids, layer_ids, decoder_vectors)
+        return self.compute_reconstruction(pos_ids, layer_ids, decoder_vectors, input_acts)
 
     def compute_skip(self, layer_id: int, inputs):
         if self.W_skip is not None:
@@ -330,7 +338,7 @@ def compute_attribution_components(self, inputs, zero_positions: slice = slice(0
         pos_ids, layer_ids, feat_ids, decoder_vectors, encoder_to_decoder_map = (
             self.select_decoder_vectors(features)
         )
-        reconstruction = self.compute_reconstruction(pos_ids, layer_ids, decoder_vectors)
+        reconstruction = self.compute_reconstruction(pos_ids, layer_ids, decoder_vectors, inputs)
 
         return {
             "activation_matrix": features,

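For orientation, a self-contained sketch of what compute_reconstruction now computes when W_skip is present (toy shapes and random weights are assumptions; only the final input_acts @ W_skip term is new in this commit):

import torch

n_layers, n_pos, d_model, n_active = 4, 6, 16, 10

decoder_vectors = torch.randn(n_active, d_model)     # one vector per active feature
pos_ids = torch.randint(0, n_pos, (n_active,))       # position of each active feature
layer_ids = torch.randint(0, n_layers, (n_active,))  # layer of each active feature
b_dec = torch.randn(n_layers, d_model)
W_skip = torch.randn(d_model, d_model)               # None for non-affine transcoders
input_acts = torch.randn(n_layers, n_pos, d_model)   # per-layer MLP inputs

# Scatter each active decoder vector into its (layer, position) slot.
flat_idx = layer_ids * n_pos + pos_ids
recon = torch.zeros(n_layers * n_pos, d_model).index_add_(0, flat_idx, decoder_vectors)
recon = recon.reshape(n_layers, n_pos, d_model) + b_dec[:, None]

# New: affine (skip-connection) transcoders add a linear read of the inputs.
recon = recon + input_acts @ W_skip
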
circuit_tracer/transcoder/single_layer_transcoder.py

Lines changed: 27 additions & 15 deletions
@@ -124,9 +124,15 @@ def encode(self, input_acts, apply_activation_function: bool = True):
             return pre_acts
         return self.activation_function(pre_acts)
 
-    def decode(self, acts):
+    def decode(self, acts, input_acts: torch.Tensor | None = None):
         W_dec = self.W_dec
-        return acts @ W_dec + self.b_dec
+        reconstruction = acts @ W_dec + self.b_dec
+        if self.W_skip is not None:
+            assert input_acts is not None, (
+                "Transcoder has skip connection but no input_acts were provided"
+            )
+            reconstruction = reconstruction + self.compute_skip(input_acts)
+        return reconstruction
 
     def compute_skip(self, input_acts):
         if self.W_skip is not None:
@@ -136,13 +142,9 @@ def compute_skip(self, input_acts):
 
     def forward(self, input_acts):
         transcoder_acts = self.encode(input_acts)
-        decoded = self.decode(transcoder_acts)
-        decoded = decoded.detach()
-        decoded.requires_grad = True
-
-        if self.W_skip is not None:
-            skip = self.compute_skip(input_acts)
-            decoded = decoded + skip
+        decoded = self.decode(transcoder_acts, input_acts)
+        # decoded = decoded.detach()
+        # decoded.requires_grad = True
 
         return decoded
 
@@ -169,7 +171,7 @@ def encode_sparse(self, input_acts, zero_positions: slice = slice(0, 1)):
 
         return sparse_acts, active_encoders
 
-    def decode_sparse(self, sparse_acts):
+    def decode_sparse(self, sparse_acts, input_acts: torch.Tensor | None = None):
         """Decode sparse activations and return reconstruction with scaled decoder vectors.
 
         Returns:
@@ -189,6 +191,11 @@ def decode_sparse(self, sparse_acts):
             n_pos, self.d_model, device=sparse_acts.device, dtype=sparse_acts.dtype
         )
         reconstruction = reconstruction.index_add_(0, pos_idx, scaled_decoders)
+        if self.W_skip is not None:
+            assert input_acts is not None, (
+                "Transcoder has skip connection but no input_acts were provided"
+            )
+            reconstruction = reconstruction + self.compute_skip(input_acts)
         reconstruction = reconstruction + self.b_dec
 
         return reconstruction, scaled_decoders
@@ -319,9 +326,12 @@ def select_decoder_vectors(self, features):
             encoder_mapping,
         )
 
-    def decode(self, acts):
+    def decode(self, acts, input_acts: torch.Tensor | None):
         return torch.stack(
-            [transcoder.decode(acts[i]) for i, transcoder in enumerate(self.transcoders)],  # type: ignore
+            [
+                transcoder.decode(acts[i], None if input_acts is None else input_acts[i])
+                for i, transcoder in enumerate[SingleLayerTranscoder](self.transcoders)  # type: ignore
+            ],
             dim=0,
         )
 
@@ -349,11 +359,13 @@ def compute_attribution_components(
         decoder_vectors = []
         sparse_acts_list = []
 
-        for layer, transcoder in enumerate(self.transcoders):
-            sparse_acts, active_encoders = transcoder.encode_sparse(  # type: ignore
+        for layer, transcoder in enumerate[SingleLayerTranscoder](self.transcoders):  # type: ignore
+            sparse_acts, active_encoders = transcoder.encode_sparse(
                 mlp_inputs[layer], zero_positions=zero_positions
             )
-            reconstruction[layer], active_decoders = transcoder.decode_sparse(sparse_acts)  # type: ignore
+            reconstruction[layer], active_decoders = transcoder.decode_sparse(
+                sparse_acts, mlp_inputs[layer]
+            )
             encoder_vectors.append(active_encoders)
             decoder_vectors.append(active_decoders)
             sparse_acts_list.append(sparse_acts)
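
The single-layer version follows the same pattern: decode now owns the skip term, so forward no longer detaches and re-adds it (hence the commented-out detach/requires_grad lines, which previously cut the backward graph at the decoder output). A sketch under assumed toy shapes:

import torch

d_model, d_sae, n_pos = 16, 64, 6
W_dec = torch.randn(d_sae, d_model)
b_dec = torch.randn(d_model)
W_skip = torch.randn(d_model, d_model)     # None when the file has no W_skip tensor

acts = torch.randn(n_pos, d_sae)           # transcoder feature activations
input_acts = torch.randn(n_pos, d_model)   # the layer's MLP input

# decode(acts, input_acts): linear decode plus the optional skip term.
reconstruction = acts @ W_dec + b_dec
if W_skip is not None:
    assert input_acts is not None, "skip connection requires input_acts"
    reconstruction = reconstruction + input_acts @ W_skip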

tests/test_attributions_gemma3_nnsight.py

Lines changed: 20 additions & 1 deletion
@@ -149,13 +149,13 @@ def verify_token_and_error_edges(
     act_rtol=1e-3,
     logit_atol=1e-5,
     logit_rtol=1e-3,
+    pos_start=1,
 ):
     s = graph.input_tokens
     adjacency_matrix = graph.adjacency_matrix.to(device=model.device, dtype=model.dtype)
     active_features = graph.active_features.to(device=model.device)
     logit_tokens = graph.logit_tokens.to(device=model.device)
     total_active_features = active_features.size(0)
-    pos_start = 1  # skip first position (BOS token)
 
     ctx = model.setup_attribution(s)
 
@@ -525,6 +525,24 @@ def test_gemma_3_1b():
     verify_feature_edges(model, graph)
 
 
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_gemma_3_1b_it():
+    s = "<bos><start_of_turn>user\nThe National Digital Analytics Group (ND"
+    model = ReplacementModel.from_pretrained(
+        "google/gemma-3-1b-it",
+        "mwhanna/gemma-scope-2-1b-it/transcoder_all/width_16k_l0_small_affine",
+        dtype=torch.float32,
+        backend="nnsight",
+    )
+    graph = attribute(s, model)
+    assert isinstance(model, NNSightReplacementModel)
+
+    print("Changing logit softcap to 0, as the logits will otherwise be off.")
+    with model.zero_softcap():
+        verify_token_and_error_edges(model, graph, pos_start=4)
+    verify_feature_edges(model, graph)
+
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 def test_gemma_3_1b_clt():
     s = "The National Digital Analytics Group (ND"
@@ -569,5 +587,6 @@ def test_gemma_3_4b():
     test_gemma3_with_dummy_transcoders()
     test_gemma3_with_dummy_clt()
     test_gemma_3_1b()
+    test_gemma_3_1b_it()
     test_gemma_3_1b_clt()
     test_gemma_3_4b()
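
The pos_start=4 passed above matches the four-token ignore_prefix ([2, 105, 2364, 107]) from replacement_model_nnsight.py: the chat-template prefix "<bos><start_of_turn>user\n" is excluded from the token/error edge checks. A hedged sanity check of that correspondence (assumes transformers is installed, you have access to the gated checkpoint, and these ids are stable across tokenizer versions):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
prefix_ids = tok("<bos><start_of_turn>user\n", add_special_tokens=False).input_ids
print(prefix_ids)            # expected: [2, 105, 2364, 107], i.e. ignore_prefix
pos_start = len(prefix_ids)  # -> 4, the value passed to verify_token_and_error_edges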

tests/test_attributions_llama.py

Lines changed: 14 additions & 0 deletions
@@ -225,8 +225,22 @@ def test_llama_3_2_1b():
     verify_feature_edges(model, graph)
 
 
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_llama_3_2_1b_clt():
+    s = "The National Digital Analytics Group (ND"
+    model = ReplacementModel.from_pretrained(
+        "meta-llama/Llama-3.2-1B", "mntss/clt-llama-3.2-1b-524k"
+    )
+    assert isinstance(model, TransformerLensReplacementModel)
+    graph = attribute(s, model, batch_size=128)
+
+    verify_token_and_error_edges(model, graph)
+    verify_feature_edges(model, graph)
+
+
 if __name__ == "__main__":
     torch.manual_seed(42)
     test_small_llama_model()
     test_large_llama_model()
     test_llama_3_2_1b()
+    test_llama_3_2_1b_clt()

tests/test_cross_layer_transcoder.py

Lines changed: 13 additions & 6 deletions
@@ -22,7 +22,7 @@ def cleanup_cuda():
 def create_test_clt_files():
     """Create temporary CLT safetensors files for testing."""
 
-    def _create_files(n_layers=4, d_model=128, d_transcoder=512):
+    def _create_files(n_layers=4, d_model=128, d_transcoder=512, skip_connection=False):
         tmpdir = tempfile.mkdtemp()
 
         # Create encoder and decoder files for each layer
@@ -41,6 +41,11 @@ def _create_files(n_layers=4, d_model=128, d_transcoder=512):
             dec_path = os.path.join(tmpdir, f"W_dec_{i}.safetensors")
            save_file(dec_dict, dec_path)
 
+        if skip_connection:
+            skip_dict = {"W_skip": torch.randn(d_model, d_model)}
+            skip_path = os.path.join(tmpdir, "W_skip.safetensors")
+            save_file(skip_dict, skip_path)
+
         return tmpdir
 
     return _create_files
@@ -49,9 +54,10 @@ def _create_files(n_layers=4, d_model=128, d_transcoder=512):
 # === Attribution Tests ===
 
 
-def test_compute_attribution_components(create_test_clt_files):
+@pytest.mark.parametrize("skip_connection", [False, True])
+def test_compute_attribution_components(create_test_clt_files, skip_connection):
     """Test the main attribution functionality of CLT."""
-    clt_path = create_test_clt_files()
+    clt_path = create_test_clt_files(skip_connection=skip_connection)
     clt = load_clt(
         clt_path,
         device=torch.device("cpu"),
@@ -64,7 +70,7 @@ def test_compute_attribution_components(create_test_clt_files):
     inputs = torch.randn(clt.n_layers, n_pos, clt.d_model, dtype=clt.b_enc.dtype)
 
     # Compute attribution components
-    components = clt.compute_attribution_components(inputs)
+    components = clt.compute_attribution_components(inputs, zero_positions=slice(0, 1))
 
     # Verify all required components are present
     assert "activation_matrix" in components
@@ -79,9 +85,10 @@ def test_compute_attribution_components(create_test_clt_files):
     assert act_matrix.is_sparse
     assert act_matrix.shape == (clt.n_layers, n_pos, clt.d_transcoder)
 
-    # Check reconstruction
+    # Check reconstruction (only positions 1 and beyond)
     reconstruction = components["reconstruction"]
     assert reconstruction.shape == (clt.n_layers, n_pos, clt.d_model)
+    assert torch.allclose(reconstruction[:, 1:], clt(inputs)[:, 1:])
 
     # Check encoder/decoder vectors have consistent counts
     n_active_encoders = act_matrix._nnz()
@@ -93,7 +100,7 @@ def test_compute_attribution_components(create_test_clt_files):
 
     # Check decoder locations
     decoder_locs = components["decoder_locations"]
-    assert decoder_locs.shape[0] == 2  # layer and position indices
+    assert decoder_locs.shape[0] == 2
 
 
 def test_encode_sparse_with_lazy_encoder(create_test_clt_files):

tests/test_single_layer_transcoder.py

Lines changed: 19 additions & 7 deletions
@@ -23,7 +23,7 @@ def cleanup_cuda():
 def create_test_transcoder_file():
     """Create a temporary transcoder safetensors file for testing."""
 
-    def _create_file(d_model=128, d_sae=512):
+    def _create_file(d_model=128, d_sae=512, skip_connection=False):
         with tempfile.NamedTemporaryFile(suffix=".safetensors", delete=False) as f:
             W_enc = torch.randn(d_sae, d_model)
             W_dec = torch.randn(d_sae, d_model)
@@ -37,6 +37,9 @@ def _create_file(d_model=128, d_sae=512):
                 "b_dec": b_dec,
             }
 
+            if skip_connection:
+                state_dict["W_skip"] = torch.randn(d_model, d_model)
+
             save_file(state_dict, f.name)
             return f.name, state_dict
 
@@ -58,13 +61,16 @@ def _create_and_track(*args, **kwargs):
 # === Attribution Tests ===
 
 
-def test_transcoder_set_attribution_components(create_test_transcoder_file):
+@pytest.mark.parametrize("skip_connection", [False, True])
+def test_transcoder_set_attribution_components(create_test_transcoder_file, skip_connection):
     """Test compute_attribution_components functionality."""
     # Create test files for multiple layers
     n_layers = 3
     paths = {}
     for layer in range(n_layers):
-        path, _ = create_test_transcoder_file(d_model=128, d_sae=512)
+        path, _ = create_test_transcoder_file(
+            d_model=128, d_sae=512, skip_connection=skip_connection
+        )
         paths[layer] = path
 
     transcoder_set = load_transcoder_set(
@@ -74,7 +80,7 @@ def test_transcoder_set_attribution_components(create_test_transcoder_file):
         feature_output_hook="hook_mlp_out",
         device=torch.device("cpu"),
         lazy_encoder=False,
-        lazy_decoder=True,  # Test with lazy decoder
+        lazy_decoder=True,
     )
 
     # Create test MLP inputs
@@ -83,7 +89,9 @@ def test_transcoder_set_attribution_components(create_test_transcoder_file):
     mlp_inputs = torch.randn(n_layers, n_pos, d_model)
 
     # Compute attribution components
-    components = transcoder_set.compute_attribution_components(mlp_inputs)
+    components = transcoder_set.compute_attribution_components(
+        mlp_inputs, zero_positions=slice(0, 1)
+    )
 
     # Verify all required components are present
     assert "activation_matrix" in components
@@ -98,9 +106,13 @@ def test_transcoder_set_attribution_components(create_test_transcoder_file):
     assert act_matrix.is_sparse
     assert act_matrix.shape == (n_layers, n_pos, 512)
 
-    # Check reconstruction
+    # Check reconstruction (only positions 1 and beyond)
     reconstruction = components["reconstruction"]
     assert reconstruction.shape == (n_layers, n_pos, d_model)
+    for layer, transcoder in enumerate(transcoder_set.transcoders):
+        assert torch.allclose(
+            reconstruction[layer, 1:], transcoder(mlp_inputs[layer])[1:], rtol=1e-4, atol=1e-4
+        )
 
     # Check encoder/decoder vectors have matching counts
     n_active = act_matrix._nnz()
@@ -110,7 +122,7 @@ def test_transcoder_set_attribution_components(create_test_transcoder_file):
 
     # Check decoder locations
     decoder_locs = components["decoder_locations"]
-    assert decoder_locs.shape == (2, n_active)  # layer and position indices
+    assert decoder_locs.shape == (2, n_active)
 
 
 def test_sparse_encode_decode(create_test_transcoder_file):
