Add unicode handling to keyword match (#52)

steven10a · web-flow · commit 58bfd4340ca7 · 2025-11-19T10:06:02.000-08:00
* Add Unicode handling to keyword match
* Handle keywords that start or end with non-word characters
* Update tests
* Apply Ruff formatting
* Address Copilot nits

Thank you to @yehorkardash for identifying this in our JS version.
diff --git a/src/guardrails/checks/text/keywords.py b/src/guardrails/checks/text/keywords.py
@@ -73,8 +73,21 @@ def _compile_pattern(keywords: tuple[str, ...]) -> re.Pattern[str]:
     Returns:
         re.Pattern[str]: Compiled regex pattern to match any given keyword.
     """
-    escaped = (re.escape(k) for k in keywords)
-    pattern_text = r"\b(?:" + "|".join(escaped) + r")\b"
+    # Build individual patterns with conditional boundary assertions
+    # Only apply (?<!\w) if keyword starts with word char, (?!\w) if it ends with word char
+    patterns = []
+    for keyword in keywords:
+        escaped = re.escape(keyword)
+        # Check first and last character of the original keyword for word character status
+        starts_with_word_char = keyword and (keyword[0].isalnum() or keyword[0] == "_")
+        ends_with_word_char = keyword and (keyword[-1].isalnum() or keyword[-1] == "_")
+
+        prefix = r"(?<!\w)" if starts_with_word_char else ""
+        suffix = r"(?!\w)" if ends_with_word_char else ""
+        patterns.append(f"{prefix}{escaped}{suffix}")
+
+    # (?<!\w) and (?!\w) use Unicode-aware lookbehind/lookahead to enforce word boundaries.
+    pattern_text = "(?:" + "|".join(patterns) + ")"
 
     return re.compile(pattern_text, re.IGNORECASE)
 
diff --git a/tests/unit/checks/test_keywords.py b/tests/unit/checks/test_keywords.py
@@ -65,3 +65,132 @@ async def test_keywords_does_not_trigger_on_benign_text() -> None:
     result = await keywords(ctx=None, data="Safe content", config=config)
 
     assert result.tripwire_triggered is False  # noqa: S101
+
+
+def test_match_keywords_does_not_match_partial_words() -> None:
+    """Ensure substrings embedded in larger words are ignored."""
+    config = KeywordCfg(keywords=["orld"])
+    result = match_keywords("Hello, world!", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is False  # noqa: S101
+
+
+def test_match_keywords_handles_numeric_tokens() -> None:
+    """Keywords containing digits should match exact tokens."""
+    config = KeywordCfg(keywords=["world123"])
+    result = match_keywords("Hello, world123", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is True  # noqa: S101
+    assert result.info["matched"] == ["world123"]  # noqa: S101
+
+
+def test_match_keywords_rejects_partial_numeric_tokens() -> None:
+    """Numeric keywords should not match when extra digits follow."""
+    config = KeywordCfg(keywords=["world123"])
+    result = match_keywords("Hello, world12345", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is False  # noqa: S101
+
+
+def test_match_keywords_handles_underscored_tokens() -> None:
+    """Underscored keywords should be detected exactly once."""
+    config = KeywordCfg(keywords=["w_o_r_l_d"])
+    result = match_keywords("Hello, w_o_r_l_d", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is True  # noqa: S101
+    assert result.info["matched"] == ["w_o_r_l_d"]  # noqa: S101
+
+
+def test_match_keywords_rejects_words_embedded_in_underscores() -> None:
+    """Words surrounded by underscores should not trigger partial matches."""
+    config = KeywordCfg(keywords=["world"])
+    result = match_keywords("Hello, test_world_test", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is False  # noqa: S101
+
+
+def test_match_keywords_handles_chinese_characters() -> None:
+    """Unicode keywords such as Chinese characters should match."""
+    config = KeywordCfg(keywords=["你好"])
+    result = match_keywords("你好", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is True  # noqa: S101
+    assert result.info["matched"] == ["你好"]  # noqa: S101
+
+
+def test_match_keywords_handles_chinese_tokens_with_digits() -> None:
+    """Unicode keywords that include digits should match whole tokens."""
+    config = KeywordCfg(keywords=["你好123"])
+    result = match_keywords("你好123", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is True  # noqa: S101
+    assert result.info["matched"] == ["你好123"]  # noqa: S101
+
+
+def test_match_keywords_rejects_partial_chinese_tokens_with_digits() -> None:
+    """Unicode keywords with trailing digits should not match supersets."""
+    config = KeywordCfg(keywords=["你好123"])
+    result = match_keywords("你好12345", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is False  # noqa: S101
+
+
+def test_match_keywords_applies_boundaries_to_all_keywords() -> None:
+    """Every keyword in a multi-token pattern should respect Unicode boundaries."""
+    config = KeywordCfg(keywords=["test", "hello", "world"])
+    result = match_keywords("testing hello world", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is True  # noqa: S101
+    assert result.info["matched"] == ["hello", "world"]  # noqa: S101
+
+
+def test_match_keywords_detects_email_like_patterns() -> None:
+    """Email-like keywords starting with punctuation should match after word chars."""
+    config = KeywordCfg(keywords=["@corp.com"])
+    result = match_keywords("foo@corp.com", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is True  # noqa: S101
+    assert result.info["matched"] == ["@corp.com"]  # noqa: S101
+
+
+def test_match_keywords_detects_hashtag_patterns() -> None:
+    """Hashtag keywords starting with punctuation should match after word chars."""
+    config = KeywordCfg(keywords=["#leak"])
+    result = match_keywords("abc#leak", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is True  # noqa: S101
+    assert result.info["matched"] == ["#leak"]  # noqa: S101
+
+
+def test_match_keywords_respects_end_boundary_for_punctuation_prefixed() -> None:
+    """Punctuation-prefixed keywords ending with word chars need end boundary."""
+    config = KeywordCfg(keywords=["@leak"])
+    # Should not match when word chars continue after
+    result = match_keywords("foo@leakmore", config, guardrail_name="Test Guardrail")
+    assert result.tripwire_triggered is False  # noqa: S101
+
+    # Should match when followed by non-word char
+    result = match_keywords("foo@leak bar", config, guardrail_name="Test Guardrail")
+    assert result.tripwire_triggered is True  # noqa: S101
+    assert result.info["matched"] == ["@leak"]  # noqa: S101
+
+
+def test_match_keywords_handles_full_punctuation_keywords() -> None:
+    """Keywords consisting only of punctuation should match anywhere."""
+    config = KeywordCfg(keywords=["@#$"])
+    result = match_keywords("test@#$test", config, guardrail_name="Test Guardrail")
+
+    assert result.tripwire_triggered is True  # noqa: S101
+    assert result.info["matched"] == ["@#$"]  # noqa: S101
+
+
+def test_match_keywords_mixed_punctuation_and_word_chars() -> None:
+    """Keywords with both punctuation prefix and suffix should work correctly."""
+    config = KeywordCfg(keywords=["@user@"])
+    # Should match when embedded
+    result = match_keywords("test@user@test", config, guardrail_name="Test Guardrail")
+    assert result.tripwire_triggered is True  # noqa: S101
+
+    # Should match even when followed by more text (no boundaries applied to punctuation edges)
+    result = match_keywords("test@user@more", config, guardrail_name="Test Guardrail")
+    assert result.tripwire_triggered is True  # noqa: S101