@@ -159,7 +159,7 @@ def get_modality_index(self) -> dict[str, tuple[int, int]]:
159159 """
160160 return self .modality_index
161161
162- def _get_modality_activation (
162+ def _get_modality_activation_mask (
163163 self ,
164164 activation : Union [
165165 Float [torch .Tensor , "batch d_model" ],
@@ -182,7 +182,7 @@ def _get_modality_activation(
-            The activation of the specified modality. The shape is the same as the input activation.
+            A boolean mask selecting the tokens of the specified modality, broadcastable against the input activation.
183183 """
184184 activation_mask = torch .isin (tokens , self .modality_indices [modality ])
185- return activation_mask .unsqueeze (1 ) * activation
185+ return activation_mask .unsqueeze (1 )
186186
187187 @overload
188188 def encode (
@@ -266,7 +266,7 @@ def encode(
             if modality == "shared":
                 # shared modality is not encoded directly but summed up during other modalities' encoding
                 continue
-            x_modality = self._get_modality_activation(x, tokens, modality)
+            activation_mask = self._get_modality_activation_mask(x, tokens, modality)
             if self.cfg.use_decoder_bias and self.cfg.apply_decoder_bias_to_pre_encoder:
                 modality_bias = (
                     self.decoder[modality].bias.to_local()  # TODO: check if this is correct # type: ignore
@@ -278,15 +278,15 @@ def encode(
                     if isinstance(self.decoder["shared"].bias, DTensor)
                     else self.decoder["shared"].bias
                 )
-                x_modality = x_modality - modality_bias - shared_bias
+                x = x - modality_bias - shared_bias
 
-            hidden_pre_modality = self.encoder[modality](x_modality)
-            hidden_pre_shared = self.encoder["shared"](x_modality)
+            hidden_pre_modality = self.encoder[modality](x)
+            hidden_pre_shared = self.encoder["shared"](x)
 
             if self.cfg.use_glu_encoder:
-                hidden_pre_modality_glu = torch.sigmoid(self.encoder_glu[modality](x_modality))
+                hidden_pre_modality_glu = torch.sigmoid(self.encoder_glu[modality](x))
                 hidden_pre_modality = hidden_pre_modality * hidden_pre_modality_glu
-                hidden_pre_shared_glu = torch.sigmoid(self.encoder_glu["shared"](x_modality))
+                hidden_pre_shared_glu = torch.sigmoid(self.encoder_glu["shared"](x))
                 hidden_pre_shared = hidden_pre_shared * hidden_pre_shared_glu
 
             if self.cfg.sparsity_include_decoder_norm:
@@ -296,7 +296,9 @@ def encode(
                 true_feature_acts_modality = hidden_pre_modality
                 true_feature_acts_shared = hidden_pre_shared
 
-            true_feature_acts_concat = torch.cat([true_feature_acts_modality, true_feature_acts_shared], dim=1)
+            true_feature_acts_concat = (
+                torch.cat([true_feature_acts_modality, true_feature_acts_shared], dim=1) * activation_mask
+            )
             activation_mask_concat = self.activation_function(true_feature_acts_concat)
             feature_acts_concat = true_feature_acts_concat * activation_mask_concat
 
@@ -313,6 +315,7 @@ def encode(
 
         hidden_pre = self.hook_hidden_pre(hidden_pre)
         feature_acts = self.hook_feature_acts(feature_acts)
+        # assert torch.all((feature_acts > 0).sum(-1) <= self.current_k)
         if return_hidden_pre:
             return feature_acts, hidden_pre
         return feature_acts
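Taken together, the hunks move the modality masking from the encoder input to the concatenated feature activations: `_get_modality_activation_mask` now returns only the boolean token mask, both encoders see the full (optionally bias-subtracted) activation `x`, and the mask zeroes the concatenated `[modality, shared]` pre-activations before the activation function is applied. The sketch below illustrates that masking semantics only; names such as the `image` modality, `encoder_image`, `encoder_shared`, and the ReLU-style gate standing in for `self.activation_function` are assumptions, not the repository's actual API.

```python
# Minimal sketch of the new masking semantics (hypothetical names, not the real module).
import torch

batch, d_model, d_sae = 4, 8, 16
x = torch.randn(batch, d_model)              # activations for a mixed-modality batch
tokens = torch.tensor([1, 7, 7, 3])          # hypothetical token ids
image_token_ids = torch.tensor([7])          # hypothetical id set for the "image" modality

# What _get_modality_activation_mask now returns: a (batch, 1) boolean mask.
activation_mask = torch.isin(tokens, image_token_ids).unsqueeze(1)

# Stand-ins for self.encoder["image"] and self.encoder["shared"]; both see the full x.
encoder_image = torch.nn.Linear(d_model, d_sae)
encoder_shared = torch.nn.Linear(d_model, d_sae)
hidden_pre_image = encoder_image(x)
hidden_pre_shared = encoder_shared(x)

# The mask broadcasts over the concatenated feature dimension, so tokens that do not
# belong to this modality contribute zero pre-activations in this modality's pass.
true_feature_acts = torch.cat([hidden_pre_image, hidden_pre_shared], dim=1) * activation_mask

# Stand-in for self.activation_function: a gate multiplied back onto the pre-activations,
# mirroring feature_acts_concat = true_feature_acts_concat * activation_mask_concat.
activation_gate = (true_feature_acts > 0).float()
feature_acts = true_feature_acts * activation_gate

# Non-image rows end up with all-zero feature activations.
assert torch.all(feature_acts[~activation_mask.squeeze(1)] == 0)
```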