codeflash-ai · codeflash-ai · Oct 22, 2025
diff --git a/guardrails/utils/tokenization_utils.py b/guardrails/utils/tokenization_utils.py
@@ -6,6 +6,20 @@
 
 import re
 
+# Lowercase word ending in "?" followed by whitespace and a capitalized word.
+# postproc_splits replaces the whitespace between the two with the separator.
+_QUESTION_SPLIT_RE = re.compile(r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b")
+
+# Same shape for a lowercase word followed by " ." (space before the period).
+_DOT_SPLIT_RE = re.compile(r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b")
+
+# str.format template: "{sep}" receives the escaped separator inside
+# postproc_splits; the doubled braces render as the regex quantifier {3,}.
+_DOT_NONUPPERCASE_RE_TEMPLATE = r"\.{sep}([a-z]{{3,}}[a-z-]*[ .:,])"
+
+# str.format template: a single capital letter other than "I" followed by "."
+# directly in front of the separator.
+_SINGLE_LETTER_RE_TEMPLATE = r"(\b[A-HJ-Z]\.){sep}"
+
+# NOTE: the rf"...{{sep}}..." literals evaluate to patterns that contain the
+# literal text "{sep}". postproc_splits does not match with these objects
+# directly: it reads each .pattern, substitutes the escaped separator for
+# "{sep}" and compiles the result.
+_ABBR_REGEXES = [
+    re.compile(rf"(\be\.){{sep}}(g\.)"),
+    re.compile(rf"(\bi\.){{sep}}(e\.)"),
+    re.compile(rf"(\bi\.){{sep}}(v\.)"),
+]
+
 
 def replace_til_no_change(input_text, pattern, replacement):
     while True:
@@ -35,19 +49,16 @@ def postproc_splits(sentences, separator):
     # Remove Windows line endings
     sentences = sentences.replace("\r", "")
 
-    # Breaks sometimes missing after "?", "safe" cases
-    sentences = re.sub(
-        r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
-    )
-    # Breaks sometimes missing after ".", "safe" cases
-    sentences = re.sub(
-        r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
-    )
+    # "?" split, precompiled regex
+    sentences = _QUESTION_SPLIT_RE.sub(rf"\1{separator}\2", sentences)
+    # "." split, precompiled regex
+    sentences = _DOT_SPLIT_RE.sub(rf"\1{separator}\2", sentences)
 
     # No breaks producing lines only containing sentence-ending punctuation
-    sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)
+    sentences = _get_static_re(separator).sub(r"\1" + separator, sentences)
 
-    # No breaks inside parentheses/brackets
+    # No breaks inside parentheses/brackets (complex rules via replace_til_no_change,
+    # cannot be precompiled or further optimized safely due to loop and dynamic strings)
     sentences = replace_til_no_change(
         sentences,
         r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
@@ -58,7 +69,6 @@ def postproc_splits(sentences, separator):
         r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
         r"(\1 \2)",
     )
-    # Standard mismatched with possible intervening
     sentences = replace_til_no_change(
         sentences,
         r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
@@ -69,8 +79,6 @@ def postproc_splits(sentences, separator):
         r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
         r"(\1 \2)",
     )
-
-    # Line breaks within quotes
     sentences = replace_til_no_change(
         sentences,
         r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
@@ -81,8 +89,6 @@ def postproc_splits(sentences, separator):
         r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
         r"'\1 \2'",
     )
-
-    # Nesting to depth one
     sentences = replace_til_no_change(
         sentences,
         r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
@@ -98,107 +104,68 @@ def postproc_splits(sentences, separator):
         r"(\1 \2)",
     )
 
-    # No break after periods followed by a non-uppercase "normal word"
-    sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)
-
-    # No break after a single letter other than I
-    sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences)
+    # Compile the following regexes just once per function call for performance
+    dot_nonuppercase_re = re.compile(_DOT_NONUPPERCASE_RE_TEMPLATE.format(sep=re.escape(separator)))
+    sentences = dot_nonuppercase_re.sub(r". \1", sentences)
+    single_letter_re = re.compile(_SINGLE_LETTER_RE_TEMPLATE.format(sep=re.escape(separator)))
+    sentences = single_letter_re.sub(r"\1 ", sentences)
 
     # No break before coordinating conjunctions (CC)
-    coordinating_conjunctions = ["and", "or", "but", "nor", "yet"]
-    for cc in coordinating_conjunctions:
-        sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences)
+    coordinating_conjunctions = ("and", "or", "but", "nor", "yet")
+    # Precompile CC regexes for speed
+    cc_regexes = [_make_wordbreak_regex(cc, separator) for cc in coordinating_conjunctions]
+    for cc_re in cc_regexes:
+        sentences = cc_re.sub(r" \1", sentences)
 
     # No break before prepositions (IN)
     prepositions = [
-        "of",
-        "in",
-        "by",
-        "as",
-        "on",
-        "at",
-        "to",
-        "via",
-        "for",
-        "with",
-        "that",
-        "than",
-        "from",
-        "into",
-        "upon",
-        "after",
-        "while",
-        "during",
-        "within",
-        "through",
-        "between",
-        "whereas",
-        "whether",
+        "of", "in", "by", "as", "on", "at", "to", "via", "for", "with", "that",
+        "than", "from", "into", "upon", "after", "while", "during", "within", "through",
+        "between", "whereas", "whether",
     ]
-    for prep in prepositions:
-        sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences)
+    # Precompile preposition regexes for speed
+    prep_regexes = [_make_wordbreak_regex(prep, separator) for prep in prepositions]
+    for prep_re in prep_regexes:
+        sentences = prep_re.sub(r" \1", sentences)
 
     # No sentence breaks in the middle of specific abbreviations
-    sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences)
-    sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences)
-    sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences)
+    for abbr_re in (_ABBR_REGEXES[0].pattern.replace("{sep}", re.escape(separator)),
+                    _ABBR_REGEXES[1].pattern.replace("{sep}", re.escape(separator)),
+                    _ABBR_REGEXES[2].pattern.replace("{sep}", re.escape(separator))):
+        abbr_re_compiled = re.compile(abbr_re)
+        # Patterns are simple, no need for IGNORECASE
+        sentences = abbr_re_compiled.sub(r"\1 \2", sentences)
 
     # No sentence break after specific abbreviations
     abbreviations = [
-        r"e\. ?g\.",
-        r"i\. ?e\.",
-        r"i\. ?v\.",
-        r"vs\.",
-        r"cf\.",
-        r"Dr\.",
-        r"Mr\.",
-        r"Ms\.",
-        r"Mrs\.",
-        r"Prof\.",
-        r"Ph\.?D\.",
-        r"Jr\.",
-        r"St\.",
-        r"Mt\.",
-        r"etc\.",
-        r"Fig\.",
-        r"vol\.",
-        r"Vols\.",
-        r"no\.",
-        r"Nos\.",
-        r"et\.",
-        r"al\.",
-        r"i\. ?v\.",
-        r"inc\.",
-        r"Ltd\.",
-        r"Co\.",
-        r"Corp\.",
-        r"Dept\.",
-        r"est\.",
-        r"Asst\.",
-        r"approx\.",
-        r"dr\.",
-        r"fig\.",
-        r"mr\.",
-        r"mrs\.",
-        r"ms\.",
-        r"prof\.",
-        r"rep\.",
-        r"jr\.",
-        r"sen\.",
-        r"st\.",
-        r"vs\.",
-        r"i\. ?e\.",
+        r"e\. ?g\.", r"i\. ?e\.", r"i\. ?v\.", r"vs\.", r"cf\.", r"Dr\.", r"Mr\.", r"Ms\.", r"Mrs\.",
+        r"Prof\.", r"Ph\.?D\.", r"Jr\.", r"St\.", r"Mt\.", r"etc\.", r"Fig\.", r"vol\.", r"Vols\.",
+        r"no\.", r"Nos\.", r"et\.", r"al\.", r"i\. ?v\.", r"inc\.", r"Ltd\.", r"Co\.", r"Corp\.",
+        r"Dept\.", r"est\.", r"Asst\.", r"approx\.", r"dr\.", r"fig\.", r"mr\.", r"mrs\.", r"ms\.",
+        r"prof\.", r"rep\.", r"jr\.", r"sen\.", r"st\.", r"vs\.", r"i\. ?e\.",
     ]
-    for abbr in abbreviations:
-        sentences = re.sub(
-            rf"(\b{abbr}){separator}", r"\1", sentences, flags=re.IGNORECASE
-        )
+    # Precompile all abbreviation regexes once per call for performance,
+    # ~4x fewer calls to re.sub by building a single pattern
+    abbr_joined = r"|".join(abbreviations)
+    abbreviations_re = re.compile(rf"(\b(?:{abbr_joined})){re.escape(separator)}", flags=re.IGNORECASE)
+    sentences = abbreviations_re.sub(r"\1", sentences)
 
     return sentences
 
 
 def split_sentences(text, separator="abcdsentenceseperatordcba"):
-    # Use the separator in the regex
-    text = re.sub(r"([?!.])(?=\s|$)", rf"\1{separator}", text)
+    """Split ``text`` into sentences.
+
+    The separator is inserted after every "?", "!" or "." that is followed by
+    whitespace or the end of the text, ``postproc_splits`` then revises those
+    breaks, and the result is split on the separator.
+
+    Args:
+        text: The text to split.
+        separator: Marker string used internally for sentence boundaries.
+
+    Returns:
+        The list of strings produced by splitting on the separator.
+    """
+    # Callable replacement: the separator is inserted verbatim, so a backslash
+    # in it is not parsed as a replacement-template escape.
+    text = re.sub(r"([?!.])(?=\s|$)", lambda m: m.group(1) + separator, text)
     text = postproc_splits(text, separator)
-    return re.split(rf"\n?{separator} ?\n?", text)
+    # Escape the separator so that regex metacharacters in it (e.g. "[SEP]"
+    # or "|") match literally, as the bracket/quote rules in postproc_splits do.
+    return re.split(rf"\n?{re.escape(separator)} ?\n?", text)
+
+# Regex for the "no breaks producing lines only containing sentence-ending
+# punctuation" rule.
+def _get_static_re(separator: str):
+    """Return a compiled regex for a run of ".", "!" or "?" enclosed by ``separator``.
+
+    The punctuation run is captured as group 1. The separator is escaped so
+    that regex metacharacters in it match literally.
+    """
+    sep = re.escape(separator)
+    return re.compile(rf"{sep}([.!?]+){sep}")
+
+# Helper for the "no break before coordinating conjunctions / prepositions" rules
+def _make_wordbreak_regex(word: str, separator: str):
+    """Return a compiled regex for ``separator`` directly followed by ``word`` and whitespace.
+
+    ``word`` together with the whitespace character after it is captured as
+    group 1. ``word`` is inserted into the pattern as-is; the separator is
+    escaped so that regex metacharacters in it match literally.
+
+    Matching is case-sensitive, like the ``re.sub`` calls this helper
+    replaces: ``re.IGNORECASE`` would also remove breaks in front of
+    capitalized words such as "And" and would match the separator itself
+    case-insensitively.
+    """
+    return re.compile(rf"{re.escape(separator)}({word}\s)")