Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 68 additions & 101 deletions guardrails/utils/tokenization_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,20 @@

import re

_QUESTION_SPLIT_RE = re.compile(r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b")

_DOT_SPLIT_RE = re.compile(r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b")

_DOT_NONUPPERCASE_RE_TEMPLATE = r"\.{sep}([a-z]{{3,}}[a-z-]*[ .:,])"

_SINGLE_LETTER_RE_TEMPLATE = r"(\b[A-HJ-Z]\.){sep}"

_ABBR_REGEXES = [
re.compile(rf"(\be\.){{sep}}(g\.)"),
re.compile(rf"(\bi\.){{sep}}(e\.)"),
re.compile(rf"(\bi\.){{sep}}(v\.)"),
]


def replace_til_no_change(input_text, pattern, replacement):
while True:
Expand Down Expand Up @@ -35,19 +49,16 @@ def postproc_splits(sentences, separator):
# Remove Windows line endings
sentences = sentences.replace("\r", "")

# Breaks sometimes missing after "?", "safe" cases
sentences = re.sub(
r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
)
# Breaks sometimes missing after ".", "safe" cases
sentences = re.sub(
r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
)
# "?" split, precompiled regex
sentences = _QUESTION_SPLIT_RE.sub(rf"\1{separator}\2", sentences)
# "." split, precompiled regex
sentences = _DOT_SPLIT_RE.sub(rf"\1{separator}\2", sentences)

# No breaks producing lines only containing sentence-ending punctuation
sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)
sentences = _get_static_re(separator).sub(r"\1" + separator, sentences)

# No breaks inside parentheses/brackets
# No breaks inside parentheses/brackets (complex rules via replace_til_no_change,
# cannot be precompiled or further optimized safely due to loop and dynamic strings)
sentences = replace_til_no_change(
sentences,
r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
Expand All @@ -58,7 +69,6 @@ def postproc_splits(sentences, separator):
r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
r"(\1 \2)",
)
# Standard mismatched with possible intervening
sentences = replace_til_no_change(
sentences,
r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
Expand All @@ -69,8 +79,6 @@ def postproc_splits(sentences, separator):
r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
r"(\1 \2)",
)

# Line breaks within quotes
sentences = replace_til_no_change(
sentences,
r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
Expand All @@ -81,8 +89,6 @@ def postproc_splits(sentences, separator):
r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
r"'\1 \2'",
)

# Nesting to depth one
sentences = replace_til_no_change(
sentences,
r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
Expand All @@ -98,107 +104,68 @@ def postproc_splits(sentences, separator):
r"(\1 \2)",
)

# No break after periods followed by a non-uppercase "normal word"
sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)

# No break after a single letter other than I
sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences)
# Compile the following regexes just once per function call for performance
dot_nonuppercase_re = re.compile(_DOT_NONUPPERCASE_RE_TEMPLATE.format(sep=re.escape(separator)))
sentences = dot_nonuppercase_re.sub(r". \1", sentences)
single_letter_re = re.compile(_SINGLE_LETTER_RE_TEMPLATE.format(sep=re.escape(separator)))
sentences = single_letter_re.sub(r"\1 ", sentences)

# No break before coordinating conjunctions (CC)
coordinating_conjunctions = ["and", "or", "but", "nor", "yet"]
for cc in coordinating_conjunctions:
sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences)
coordinating_conjunctions = ("and", "or", "but", "nor", "yet")
# Precompile CC regexes for speed
cc_regexes = [_make_wordbreak_regex(cc, separator) for cc in coordinating_conjunctions]
for cc_re in cc_regexes:
sentences = cc_re.sub(r" \1", sentences)

# No break before prepositions (IN)
prepositions = [
"of",
"in",
"by",
"as",
"on",
"at",
"to",
"via",
"for",
"with",
"that",
"than",
"from",
"into",
"upon",
"after",
"while",
"during",
"within",
"through",
"between",
"whereas",
"whether",
"of", "in", "by", "as", "on", "at", "to", "via", "for", "with", "that",
"than", "from", "into", "upon", "after", "while", "during", "within", "through",
"between", "whereas", "whether",
]
for prep in prepositions:
sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences)
# Precompile preposition regexes for speed
prep_regexes = [_make_wordbreak_regex(prep, separator) for prep in prepositions]
for prep_re in prep_regexes:
sentences = prep_re.sub(r" \1", sentences)

# No sentence breaks in the middle of specific abbreviations
sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences)
sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences)
sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences)
for abbr_re in (_ABBR_REGEXES[0].pattern.replace("{sep}", re.escape(separator)),
_ABBR_REGEXES[1].pattern.replace("{sep}", re.escape(separator)),
_ABBR_REGEXES[2].pattern.replace("{sep}", re.escape(separator))):
abbr_re_compiled = re.compile(abbr_re)
# Patterns are simple, no need for IGNORECASE
sentences = abbr_re_compiled.sub(r"\1 \2", sentences)

# No sentence break after specific abbreviations
abbreviations = [
r"e\. ?g\.",
r"i\. ?e\.",
r"i\. ?v\.",
r"vs\.",
r"cf\.",
r"Dr\.",
r"Mr\.",
r"Ms\.",
r"Mrs\.",
r"Prof\.",
r"Ph\.?D\.",
r"Jr\.",
r"St\.",
r"Mt\.",
r"etc\.",
r"Fig\.",
r"vol\.",
r"Vols\.",
r"no\.",
r"Nos\.",
r"et\.",
r"al\.",
r"i\. ?v\.",
r"inc\.",
r"Ltd\.",
r"Co\.",
r"Corp\.",
r"Dept\.",
r"est\.",
r"Asst\.",
r"approx\.",
r"dr\.",
r"fig\.",
r"mr\.",
r"mrs\.",
r"ms\.",
r"prof\.",
r"rep\.",
r"jr\.",
r"sen\.",
r"st\.",
r"vs\.",
r"i\. ?e\.",
r"e\. ?g\.", r"i\. ?e\.", r"i\. ?v\.", r"vs\.", r"cf\.", r"Dr\.", r"Mr\.", r"Ms\.", r"Mrs\.",
r"Prof\.", r"Ph\.?D\.", r"Jr\.", r"St\.", r"Mt\.", r"etc\.", r"Fig\.", r"vol\.", r"Vols\.",
r"no\.", r"Nos\.", r"et\.", r"al\.", r"i\. ?v\.", r"inc\.", r"Ltd\.", r"Co\.", r"Corp\.",
r"Dept\.", r"est\.", r"Asst\.", r"approx\.", r"dr\.", r"fig\.", r"mr\.", r"mrs\.", r"ms\.",
r"prof\.", r"rep\.", r"jr\.", r"sen\.", r"st\.", r"vs\.", r"i\. ?e\.",
]
for abbr in abbreviations:
sentences = re.sub(
rf"(\b{abbr}){separator}", r"\1", sentences, flags=re.IGNORECASE
)
# Precompile all abbreviation regexes once per call for performance,
# ~4x fewer calls to re.sub by building a single pattern
abbr_joined = r"|".join(abbreviations)
abbreviations_re = re.compile(rf"(\b(?:{abbr_joined})){re.escape(separator)}", flags=re.IGNORECASE)
sentences = abbreviations_re.sub(r"\1", sentences)

return sentences


def split_sentences(text, separator="abcdsentenceseperatordcba"):
    """Split ``text`` into sentences.

    A ``separator`` marker is inserted after every ``?``, ``!`` or ``.`` that
    is followed by whitespace or the end of the text. The marked text is then
    post-processed by ``postproc_splits`` and finally split on the markers
    that are left.

    Args:
        text: The text to split.
        separator: Marker string inserted at candidate sentence boundaries.
            It is interpolated into a regex replacement template, so it
            should be plain text that does not occur in ``text`` and contains
            no backslashes or leading digits.

    Returns:
        The list of pieces obtained by splitting on the separator, with an
        optional newline before it and an optional space and newline after it
        consumed by the split.
    """
    # Mark every sentence-ending punctuation character exactly once.
    text = re.sub(r"([?!.])(?=\s|$)", rf"\1{separator}", text)
    text = postproc_splits(text, separator)
    # Escape the separator so it is matched literally when splitting.
    return re.split(rf"\n?{re.escape(separator)} ?\n?", text)

# Regex for the "no breaks producing lines only containing sentence-ending punctuation" rule
def _get_static_re(separator: str):
return re.compile(rf"{separator}([.!?]+){separator}")

# Coordinating conjunctions/prepositions regex helper
def _make_wordbreak_regex(word: str, separator: str):
return re.compile(rf"{separator}({word}\s)", re.IGNORECASE)