Skip to content

Commit 58bfd43

Browse files
authored
Add unicode handling to keyword match (#52)
* Add unicode handling to keyword match * Handle keywords that start or end with non-word characters * Updated tests * Ruff formatting * Address copilot nits. Thank you to @yehorkardash for identifying this in our JS version.
1 parent eb724bf commit 58bfd43

File tree

2 files changed

+144
-2
lines changed

2 files changed

+144
-2
lines changed

src/guardrails/checks/text/keywords.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,21 @@ def _compile_pattern(keywords: tuple[str, ...]) -> re.Pattern[str]:
7373
Returns:
7474
re.Pattern[str]: Compiled regex pattern to match any given keyword.
7575
"""
76-
escaped = (re.escape(k) for k in keywords)
77-
pattern_text = r"\b(?:" + "|".join(escaped) + r")\b"
76+
# Build individual patterns with conditional boundary assertions
77+
# Only apply (?<!\w) if keyword starts with word char, (?!\w) if it ends with word char
78+
patterns = []
79+
for keyword in keywords:
80+
escaped = re.escape(keyword)
81+
# Check first and last character of the original keyword for word character status
82+
starts_with_word_char = keyword and (keyword[0].isalnum() or keyword[0] == "_")
83+
ends_with_word_char = keyword and (keyword[-1].isalnum() or keyword[-1] == "_")
84+
85+
prefix = r"(?<!\w)" if starts_with_word_char else ""
86+
suffix = r"(?!\w)" if ends_with_word_char else ""
87+
patterns.append(f"{prefix}{escaped}{suffix}")
88+
89+
# (?<!\w) and (?!\w) use Unicode-aware lookbehind/lookahead to enforce word boundaries.
90+
pattern_text = "(?:" + "|".join(patterns) + ")"
7891

7992
return re.compile(pattern_text, re.IGNORECASE)
8093

tests/unit/checks/test_keywords.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,132 @@ async def test_keywords_does_not_trigger_on_benign_text() -> None:
6565
result = await keywords(ctx=None, data="Safe content", config=config)
6666

6767
assert result.tripwire_triggered is False # noqa: S101
68+
69+
70+
def test_match_keywords_does_not_match_partial_words() -> None:
71+
"""Ensure substrings embedded in larger words are ignored."""
72+
config = KeywordCfg(keywords=["orld"])
73+
result = match_keywords("Hello, world!", config, guardrail_name="Test Guardrail")
74+
75+
assert result.tripwire_triggered is False # noqa: S101
76+
77+
78+
def test_match_keywords_handles_numeric_tokens() -> None:
79+
"""Keywords containing digits should match exact tokens."""
80+
config = KeywordCfg(keywords=["world123"])
81+
result = match_keywords("Hello, world123", config, guardrail_name="Test Guardrail")
82+
83+
assert result.tripwire_triggered is True # noqa: S101
84+
assert result.info["matched"] == ["world123"] # noqa: S101
85+
86+
87+
def test_match_keywords_rejects_partial_numeric_tokens() -> None:
88+
"""Numeric keywords should not match when extra digits follow."""
89+
config = KeywordCfg(keywords=["world123"])
90+
result = match_keywords("Hello, world12345", config, guardrail_name="Test Guardrail")
91+
92+
assert result.tripwire_triggered is False # noqa: S101
93+
94+
95+
def test_match_keywords_handles_underscored_tokens() -> None:
96+
"""Underscored keywords should be detected exactly once."""
97+
config = KeywordCfg(keywords=["w_o_r_l_d"])
98+
result = match_keywords("Hello, w_o_r_l_d", config, guardrail_name="Test Guardrail")
99+
100+
assert result.tripwire_triggered is True # noqa: S101
101+
assert result.info["matched"] == ["w_o_r_l_d"] # noqa: S101
102+
103+
104+
def test_match_keywords_rejects_words_embedded_in_underscores() -> None:
105+
"""Words surrounded by underscores should not trigger partial matches."""
106+
config = KeywordCfg(keywords=["world"])
107+
result = match_keywords("Hello, test_world_test", config, guardrail_name="Test Guardrail")
108+
109+
assert result.tripwire_triggered is False # noqa: S101
110+
111+
112+
def test_match_keywords_handles_chinese_characters() -> None:
113+
"""Unicode keywords such as Chinese characters should match."""
114+
config = KeywordCfg(keywords=["你好"])
115+
result = match_keywords("你好", config, guardrail_name="Test Guardrail")
116+
117+
assert result.tripwire_triggered is True # noqa: S101
118+
assert result.info["matched"] == ["你好"] # noqa: S101
119+
120+
121+
def test_match_keywords_handles_chinese_tokens_with_digits() -> None:
122+
"""Unicode keywords that include digits should match whole tokens."""
123+
config = KeywordCfg(keywords=["你好123"])
124+
result = match_keywords("你好123", config, guardrail_name="Test Guardrail")
125+
126+
assert result.tripwire_triggered is True # noqa: S101
127+
assert result.info["matched"] == ["你好123"] # noqa: S101
128+
129+
130+
def test_match_keywords_rejects_partial_chinese_tokens_with_digits() -> None:
131+
"""Unicode keywords with trailing digits should not match supersets."""
132+
config = KeywordCfg(keywords=["你好123"])
133+
result = match_keywords("你好12345", config, guardrail_name="Test Guardrail")
134+
135+
assert result.tripwire_triggered is False # noqa: S101
136+
137+
138+
def test_match_keywords_applies_boundaries_to_all_keywords() -> None:
139+
"""Every keyword in a multi-token pattern should respect Unicode boundaries."""
140+
config = KeywordCfg(keywords=["test", "hello", "world"])
141+
result = match_keywords("testing hello world", config, guardrail_name="Test Guardrail")
142+
143+
assert result.tripwire_triggered is True # noqa: S101
144+
assert result.info["matched"] == ["hello", "world"] # noqa: S101
145+
146+
147+
def test_match_keywords_detects_email_like_patterns() -> None:
148+
"""Email-like keywords starting with punctuation should match after word chars."""
149+
config = KeywordCfg(keywords=["@corp.com"])
150+
result = match_keywords("foo@corp.com", config, guardrail_name="Test Guardrail")
151+
152+
assert result.tripwire_triggered is True # noqa: S101
153+
assert result.info["matched"] == ["@corp.com"] # noqa: S101
154+
155+
156+
def test_match_keywords_detects_hashtag_patterns() -> None:
157+
"""Hashtag keywords starting with punctuation should match after word chars."""
158+
config = KeywordCfg(keywords=["#leak"])
159+
result = match_keywords("abc#leak", config, guardrail_name="Test Guardrail")
160+
161+
assert result.tripwire_triggered is True # noqa: S101
162+
assert result.info["matched"] == ["#leak"] # noqa: S101
163+
164+
165+
def test_match_keywords_respects_end_boundary_for_punctuation_prefixed() -> None:
166+
"""Punctuation-prefixed keywords ending with word chars need end boundary."""
167+
config = KeywordCfg(keywords=["@leak"])
168+
# Should not match when word chars continue after
169+
result = match_keywords("foo@leakmore", config, guardrail_name="Test Guardrail")
170+
assert result.tripwire_triggered is False # noqa: S101
171+
172+
# Should match when followed by non-word char
173+
result = match_keywords("foo@leak bar", config, guardrail_name="Test Guardrail")
174+
assert result.tripwire_triggered is True # noqa: S101
175+
assert result.info["matched"] == ["@leak"] # noqa: S101
176+
177+
178+
def test_match_keywords_handles_full_punctuation_keywords() -> None:
179+
"""Keywords consisting only of punctuation should match anywhere."""
180+
config = KeywordCfg(keywords=["@#$"])
181+
result = match_keywords("test@#$test", config, guardrail_name="Test Guardrail")
182+
183+
assert result.tripwire_triggered is True # noqa: S101
184+
assert result.info["matched"] == ["@#$"] # noqa: S101
185+
186+
187+
def test_match_keywords_mixed_punctuation_and_word_chars() -> None:
188+
"""Keywords with both punctuation prefix and suffix should work correctly."""
189+
config = KeywordCfg(keywords=["@user@"])
190+
# Should match when embedded
191+
result = match_keywords("test@user@test", config, guardrail_name="Test Guardrail")
192+
assert result.tripwire_triggered is True # noqa: S101
193+
194+
# Should match even when followed by more text (no boundaries applied to punctuation edges)
195+
result = match_keywords("test@user@more", config, guardrail_name="Test Guardrail")
196+
assert result.tripwire_triggered is True # noqa: S101

0 commit comments

Comments
 (0)