Commit cd0f71e

feat(autointerp): refactor to async & support lorsa
* feat(autointerp): better parallelization with async
* feat(database): show progress for database operations (add analysis & update feature)
* misc(ruff): fix ruff & typecheck errors
* feat(autointerp): update UI to support autointerp without verification
* fix(format): fix pyright issues
* fix(misc): remove try-except logic for progress measurement in autointerp
* feat(autointerp): support max suppressing logits in autointerp
* feat(autointerp): improve autointerp prompts and support lorsa autointerp with z pattern
* fix(misc): ruff fixes for autointerp
1 parent 76956f7 commit cd0f71e
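The headline change is that explanation and scoring now run through asyncio, so many features can be interpreted concurrently instead of one LLM round trip at a time. The event-loop code itself is not in the diffs shown below; as a loose sketch of the general pattern only (the names `interpret_feature`, `interpret_all`, and `MAX_CONCURRENCY` are hypothetical, not from this commit), bounded async fan-out looks like:

    import asyncio

    MAX_CONCURRENCY = 8  # hypothetical cap on in-flight LLM requests

    async def interpret_feature(feature_id: int) -> str:
        # Placeholder for one explain-then-score round trip to the LLM.
        await asyncio.sleep(0.1)
        return f"explanation for feature {feature_id}"

    async def interpret_all(feature_ids: list[int]) -> list[str]:
        semaphore = asyncio.Semaphore(MAX_CONCURRENCY)

        async def bounded(fid: int) -> str:
            async with semaphore:
                return await interpret_feature(fid)

        # gather preserves input order while letting requests overlap
        return await asyncio.gather(*(bounded(fid) for fid in feature_ids))

    print(asyncio.run(interpret_all(list(range(4)))))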

15 files changed: 1477 additions & 1371 deletions

src/lm_saes/analysis/__init__.py

Lines changed: 4 additions & 3 deletions
@@ -1,13 +1,14 @@
-from .direct_logit_attributor import DirectLogitAttributor
-from .feature_analyzer import FeatureAnalyzer
-from .feature_interpreter import (
+from lm_saes.analysis.autointerp import (
     AutoInterpConfig,
     ExplainerType,
     FeatureInterpreter,
     ScorerType,
     TokenizedSample,
 )

+from .direct_logit_attributor import DirectLogitAttributor
+from .feature_analyzer import FeatureAnalyzer
+
 __all__ = [
     "FeatureAnalyzer",
     "FeatureInterpreter",
src/lm_saes/analysis/autointerp/__init__.py (new file)

Lines changed: 43 additions & 0 deletions

"""Prompt builders for auto-interpretation of SAE features.

This package contains modules for generating prompts used in the auto-interpretation
process, organized by purpose:
- explanation_prompts: Prompts for generating feature explanations
- evaluation_prompts: Prompts for evaluating feature explanations
"""

from .autointerp_base import (
    AutoInterpConfig,
    ExplainerType,
    ScorerType,
    Segment,
    TokenizedSample,
    process_token,
)
from .evaluation_prompts import (
    generate_detection_prompt,
    generate_fuzzing_prompt,
)
from .explanation_prompts import (
    generate_explanation_prompt,
    generate_explanation_prompt_neuronpedia,
)
from .feature_interpreter import (
    FeatureInterpreter,
)

__all__ = [
    "generate_explanation_prompt",
    "generate_explanation_prompt_neuronpedia",
    "generate_detection_prompt",
    "generate_fuzzing_prompt",
    "FeatureInterpreter",
    "AutoInterpConfig",
    "ExplainerType",
    "ScorerType",
    "Segment",
    "TokenizedSample",
    "process_token",
]
src/lm_saes/analysis/autointerp/autointerp_base.py (new file)

Lines changed: 242 additions & 0 deletions

"""Utility classes and functions for auto-interpretation of SAE features.

This module contains shared utilities used across the auto-interpretation system,
including configuration, data structures, and helper functions.
"""

from dataclasses import dataclass
from enum import Enum
from typing import Any, Optional

import torch
from pydantic import Field

from lm_saes.config import BaseConfig
from lm_saes.utils.logging import get_logger

logger = get_logger("analysis.autointerp_utils")


def process_token(token: str) -> str:
    """Process a token string by replacing special characters.

    Args:
        token: The token string to process

    Returns:
        Processed token string with special characters replaced
    """
    return token.replace("\n", "⏎").replace("\t", "→").replace("\r", "↵")
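`process_token` swaps whitespace control characters for visible glyphs so that a highlighted token never breaks the one-example-per-line layout of the prompts below; for instance:

    >>> process_token("foo\nbar\tbaz")
    'foo⏎bar→baz'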
class ExplainerType(str, Enum):
    """Types of LLM explainers supported."""

    OPENAI = "openai"
    NEURONPEDIA = "neuronpedia"


class ScorerType(str, Enum):
    """Types of explanation scoring methods."""

    DETECTION = "detection"
    FUZZING = "fuzzing"
    GENERATION = "generation"
    SIMULATION = "simulation"


class AutoInterpConfig(BaseConfig):
    """Configuration for automatic interpretation of SAE features."""

    # LLM settings
    explainer_type: ExplainerType = ExplainerType.OPENAI
    openai_api_key: Optional[str] = None
    openai_model: str = "gpt-3.5-turbo"
    openai_base_url: Optional[str] = None
    openai_proxy: Optional[str] = None

    # Activation retrieval settings
    n_activating_examples: int = 7
    n_non_activating_examples: int = 20
    activation_threshold: float = 0.7  # Threshold relative to max activation for highlighting tokens
    max_length: int = 50

    # Scoring settings
    scorer_type: list[ScorerType] = Field(default_factory=lambda: [ScorerType.DETECTION, ScorerType.FUZZING])

    # Detection settings
    detection_n_examples: int = 5  # Number of examples to show for detection

    # Fuzzing settings
    fuzzing_n_examples: int = 5  # Number of examples to use for fuzzing
    fuzzing_decile_correct: int = 5  # Number of correctly marked examples per decile
    fuzzing_decile_incorrect: int = 2  # Number of incorrectly marked examples per decile

    # Prompting settings
    include_cot: bool = True  # Whether to use chain-of-thought prompting
    overwrite_existing: bool = False  # Whether to overwrite existing interpretations
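`AutoInterpConfig` extends the project's pydantic-based `BaseConfig` (note the `Field` default factory above), so a run can be configured with plain keyword overrides. A minimal sketch; the field values here are illustrative, not recommendations:

    cfg = AutoInterpConfig(
        explainer_type=ExplainerType.OPENAI,
        openai_model="gpt-4o-mini",  # illustrative; any OpenAI-compatible model name
        scorer_type=[ScorerType.DETECTION],  # run detection scoring only
        activation_threshold=0.6,  # highlight tokens above 60% of the max activation
    )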
@dataclass
class Segment:
    """A segment of text with its activation value."""

    text: str
    """The text of the segment."""

    activation: float
    """The activation value of the segment."""

    def display(self, abs_threshold: float) -> str:
        """Display the segment, wrapped in << >> delimiters if it activates above the threshold."""
        if self.activation > abs_threshold:
            return f"<<{self.text}>>"
        else:
            return self.text

    def display_max(self, abs_threshold: float) -> str:
        """Display the segment text (with a trailing newline) if it exceeds the threshold."""
        if self.activation > abs_threshold:
            return f"{self.text}\n"
        else:
            return ""
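`display` takes an absolute threshold (callers below pass `threshold * max_activation`) and produces the << >> markers that the evaluation prompts later refer to; for instance:

    >>> Segment("cat", 3.0).display(abs_threshold=2.0)
    '<<cat>>'
    >>> Segment("dog", 1.0).display(abs_threshold=2.0)
    'dog'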
@dataclass
class ZPatternSegment:
    """Data for the z pattern of a single token."""

    contributing_indices: list[int]
    """The indices of the contributing tokens in the sequence."""

    contributions: list[float]
    """The contributions of the contributing tokens to the activation of the token."""

    max_contribution: float
    """The maximum contribution among the contributing tokens."""


@dataclass
class TokenizedSample:
    """A tokenized sample with its activation pattern organized into segments."""

    segments: list[Segment]
    """List of segments, each containing start/end positions and activation values."""

    max_activation: float
    """Global maximum activation value."""

    z_pattern_data: dict[int, ZPatternSegment] | None = None
    """Optional per-token z pattern data, keyed by token index."""

    def display_highlighted(self, threshold: float = 0.7) -> str:
        """Get the text with activating segments highlighted with << >> delimiters.

        Args:
            threshold: Threshold relative to max activation for highlighting

        Returns:
            Text with activating segments highlighted
        """
        highlighted_text = "".join([seg.display(threshold * self.max_activation) for seg in self.segments])
        return highlighted_text

    def display_plain(self) -> str:
        """Get the text with all segments displayed."""
        return "".join([seg.text for seg in self.segments])

    def display_max(self, threshold: float = 0.7) -> str:
        """Get the text with max activating tokens and their context."""
        max_activation_text = ""
        hash_ = {}
        for i, seg in enumerate(self.segments):
            if seg.activation > threshold * self.max_activation:
                text = seg.text
                if text != "" and hash_.get(text, None) is None:
                    hash_[text] = 1
                    prev_text = "".join([self.segments[idx].text for idx in range(max(0, i - 3), i)])
                    if self.z_pattern_data is not None and i in self.z_pattern_data:
                        z_pattern_segment = self.z_pattern_data[i]
                        k_prev_tokens = [
                            f"({process_token(''.join([self.segments[idx].text for idx in range(max(0, j - 3), j)]))}) "
                            f"{process_token(self.segments[j].text)}"
                            for j, contribution in zip(
                                z_pattern_segment.contributing_indices, z_pattern_segment.contributions
                            )
                            if contribution > threshold * z_pattern_segment.max_contribution
                        ]
                        contributing_text = f"[{'; '.join(k_prev_tokens)}] => "
                        max_activation_text += contributing_text
                    max_activation_text += f"({process_token(prev_text)}) {process_token(text)}\n"
        return max_activation_text

    def display_next(self, threshold: float = 0.7) -> str:
        """Get the token immediately after the max activating token."""
        next_activation_text = ""
        hash_ = {}
        flag = False
        for seg in self.segments:
            if flag:
                text = seg.text
                if text != "" and hash_.get(text, None) is None:
                    hash_[text] = 1
                    next_activation_text = process_token(text) + "\n"
            flag = seg.activation > threshold * self.max_activation
        return next_activation_text

    def add_z_pattern_data(
        self,
        z_pattern_indices: torch.Tensor,
        z_pattern_values: torch.Tensor,
        origins: list[dict[str, Any]],
    ):
        """Attach per-token z pattern data from sparse (activating, contributing) index pairs."""
        self.z_pattern_data = {}
        activating_indices = z_pattern_indices[0].unique_consecutive()
        for i in activating_indices:
            if origins[i] is not None:
                contributing_indices_mask = z_pattern_indices[0] == i
                self.z_pattern_data[i.item()] = ZPatternSegment(
                    contributing_indices=z_pattern_indices[1, contributing_indices_mask].tolist(),
                    contributions=z_pattern_values[contributing_indices_mask].tolist(),
                    max_contribution=z_pattern_values[contributing_indices_mask].max().item(),
                )

    def has_z_pattern_data(self):
        return self.z_pattern_data is not None

    @staticmethod
    def construct(
        text: str,
        activations: torch.Tensor,
        origins: list[dict[str, Any]],
        max_activation: float,
    ) -> "TokenizedSample":
        """Construct a TokenizedSample from text, activations, and origins.

        Args:
            text: The full text string
            activations: Tensor of activation values
            origins: List of origin dictionaries with position information
            max_activation: Maximum activation value

        Returns:
            A TokenizedSample instance
        """
        positions: set[int] = set()
        for origin in origins:
            if origin and origin["key"] == "text":
                assert "range" in origin, f"Origin {origin} does not have a range"
                positions.add(origin["range"][0])
                positions.add(origin["range"][1])

        sorted_positions = sorted(positions)

        segments = []
        for i in range(len(sorted_positions) - 1):
            start, end = sorted_positions[i], sorted_positions[i + 1]
            try:
                segment_activation = max(
                    act
                    for origin, act in zip(origins, activations)
                    if origin and origin["key"] == "text" and origin["range"][0] >= start and origin["range"][1] <= end
                )
            except Exception as e:
                logger.error(f"Error processing segment: start={start}, end={end}, segment={text[start:end]}. Error: {e}")
                continue
            segments.append(Segment(text[start:end], segment_activation.item()))

        return TokenizedSample(segments, max_activation)
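Putting the pieces together, a minimal hand-built sample (bypassing `construct`, whose `origins` format comes from the upstream tokenization pipeline) shows the main views:

    sample = TokenizedSample(
        segments=[Segment("The ", 0.0), Segment("cat", 5.0), Segment(" sat", 1.0)],
        max_activation=5.0,
    )

    sample.display_plain()        # 'The cat sat'
    sample.display_highlighted()  # 'The <<cat>> sat'  (only 5.0 exceeds 0.7 * 5.0)
    sample.display_max()          # '(The ) cat\n'     (max token with up to 3 segments of left context)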
src/lm_saes/analysis/autointerp/evaluation_prompts.py (new file)

Lines changed: 65 additions & 0 deletions

"""Prompt builders for evaluating feature explanations.

This module contains functions for generating prompts used to evaluate SAE feature
explanations, including detection and fuzzing evaluation methods.
"""

from typing import Any

from lm_saes.analysis.autointerp.autointerp_base import AutoInterpConfig, TokenizedSample


def generate_detection_prompt(
    cfg: AutoInterpConfig,
    explanation: dict[str, Any],
    examples: list[TokenizedSample],
) -> tuple[str, str]:
    """Generate a prompt for detection evaluation.

    Args:
        cfg: Auto-interpretation configuration
        explanation: The explanation to evaluate
        examples: List of examples (mix of activating and non-activating)

    Returns:
        Tuple of (system_prompt, user_prompt) strings
    """
    system_prompt = f"""We're studying features in a neural network. Each feature activates on some particular word/words/substring/concept in a short document. You will be given a short explanation of what this feature activates for, and then be shown {len(examples)} example sequences in random order. Return a boolean for each example indicating whether you think the feature should activate at least once, on ANY of the words or substrings in the document: true if it does, false if it doesn't. Try not to be overly specific in your interpretation of the explanation."""
    system_prompt += """
Your output should be a JSON object with the following fields: `steps` and `evaluation_results`. `steps` should be an array of strings, each representing a step in the chain-of-thought process within 50 words. `evaluation_results` should be an array of booleans, each representing whether the feature should activate on the corresponding example.
"""
    user_prompt = f"Here is the explanation:\n\n{explanation['final_explanation']}\n\nHere are the examples:\n\n"

    for i, example in enumerate(examples, 1):
        user_prompt += f"Example {i}: {example.display_plain()}\n"

    return system_prompt, user_prompt


def generate_fuzzing_prompt(
    cfg: AutoInterpConfig,
    explanation: dict[str, Any],
    examples: list[tuple[TokenizedSample, bool]],  # (sample, is_correctly_marked)
) -> tuple[str, str]:
    """Generate a prompt for fuzzing evaluation.

    Args:
        cfg: Auto-interpretation configuration
        explanation: The explanation to evaluate
        examples: List of tuples (example, is_correctly_marked)

    Returns:
        Tuple of (system_prompt, user_prompt) strings
    """
    system_prompt = f"""We're studying features in a neural network. Each feature activates on some particular word/words/substring/concept in a short document. You will be given a short explanation of what this feature activates for, and then be shown {len(examples)} example sequences in random order. In each example, text segments highlighted with << >> are presented as activating the feature as described in the explanation. Return a boolean for each example indicating whether you think the highlighted parts CORRECTLY correspond to the explanation: true if they do, false if they don't. Try not to be overly specific in your interpretation of the explanation."""
    system_prompt += """
Your output should be a JSON object with the following fields: `steps` and `evaluation_results`. `steps` should be an array of strings, each representing a step in the chain-of-thought process within 50 words. `evaluation_results` should be an array of booleans, each representing whether the feature should activate on the corresponding example.
"""
    user_prompt = f"Here is the explanation:\n\n{explanation['final_explanation']}\n\nHere are the examples:\n\n"

    for i, (example, _) in enumerate(examples, 1):
        highlighted = example.display_highlighted(cfg.activation_threshold)
        user_prompt += f"Example {i}: {highlighted}\n"

    return system_prompt, user_prompt
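A minimal sketch of driving the detection builder, reusing the `cfg` and `sample` objects sketched above (`final_explanation` is the key both builders read from the explanation dict):

    explanation = {"final_explanation": "Fires on the animal noun 'cat'."}
    examples = [sample]  # in practice a shuffled mix of activating and non-activating samples

    system_prompt, user_prompt = generate_detection_prompt(cfg, explanation, examples)
    # The system prompt asks the judge for a JSON object:
    #   {"steps": [...], "evaluation_results": [true, ...]}  # one boolean per example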
