
NLTK Removal #1102


Merged: 24 commits, Oct 21, 2024
24 commits (changes shown from 20 commits)
bbef78a
removed nltk dependency
AlejandroEsquivel Sep 27, 2024
0298730
remove nltk import and download from validator base
AlejandroEsquivel Sep 27, 2024
e699397
remove commented out test referencing nltk
AlejandroEsquivel Sep 27, 2024
c1f7350
throwing import error if nltk not available in detect_pii mock
AlejandroEsquivel Sep 27, 2024
416f290
Added rule based sentence tokenization from WordTokenizers.jl with mi…
AlejandroEsquivel Sep 27, 2024
6743df2
Added new version of split_sentence_str using new rule based sentence…
AlejandroEsquivel Sep 27, 2024
acae737
updates to factor in quotes during sentence splitting
AlejandroEsquivel Sep 27, 2024
815300f
updated poetry.lock
AlejandroEsquivel Sep 27, 2024
1a2546a
replaced split sentence default
AlejandroEsquivel Sep 27, 2024
bd0ab50
testing changes using custom separator in wordtokenizer algo
AlejandroEsquivel Oct 1, 2024
4ebef18
fix for counting subs
AlejandroEsquivel Oct 1, 2024
fb1d2d0
reverted split sentence in validators base
AlejandroEsquivel Oct 1, 2024
3fb5765
reverted to pre-seperator algo, added fix for conditional white space…
AlejandroEsquivel Oct 1, 2024
f453ecc
fix for optional white space after potential line endings ?!.
AlejandroEsquivel Oct 1, 2024
88477c7
Merge branch 'main' into feat/nltk-removal
AlejandroEsquivel Oct 1, 2024
6462558
added back modified seperator algo, fix for split sentence
AlejandroEsquivel Oct 1, 2024
6f8b175
Fix regex patterns for abbreviations in tokenization_utils_seperator.py
AlejandroEsquivel Oct 1, 2024
27a0419
fix tests
AlejandroEsquivel Oct 1, 2024
49aa02d
remove nltk references from tests
AlejandroEsquivel Oct 1, 2024
affb205
removed older scripts
AlejandroEsquivel Oct 1, 2024
d3cfed9
bumped api version
AlejandroEsquivel Oct 18, 2024
97f3457
dep updates
zsimjee Oct 18, 2024
5562688
Merge branch '0.6.0-dev' into feat/nltk-removal
zsimjee Oct 19, 2024
7d7240b
fix test out of main
dtam Oct 21, 2024
206 changes: 206 additions & 0 deletions guardrails/utils/tokenization_utils.py
@@ -0,0 +1,206 @@
# This file contains code adapted from the WordTokenizers.jl
# https://github.com/JuliaText/WordTokenizers.jl project.
# It is subject to the license terms in the Apache License file
# found in the top-level directory of this distribution.
# This file has been modified by Guardrails AI on September 27 2024.

import re


def replace_til_no_change(input_text, pattern, replacement):
while True:
new_text = re.sub(pattern, replacement, input_text)
if new_text == input_text:
break
input_text = new_text
return input_text


def postproc_splits(sentences, separator):
"""
Applies heuristic rules to repair sentence splitting errors.
Developed for use as postprocessing for the GENIA sentence
splitter on PubMed abstracts, with minor tweaks for
full-text documents.

`sentences` should be a string, with line breaks on sentence boundaries.
Returns a similar string, but more correct.

Based on
https://github.com/ninjin/geniass/blob/master/geniass-postproc.pl
Which is
(c) 2010 Sampo Pyysalo. No rights reserved, i.e. do whatever you like with this.
Which draws in part on heuristics included in Yoshimasa Tsuruoka's
medss.pl script.
"""

# Remove Windows line endings
sentences = sentences.replace("\r", "")

# Breaks sometimes missing after "?", "safe" cases
sentences = re.sub(
r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
)
# Breaks sometimes missing after ".", "safe" cases
sentences = re.sub(
r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
)

# No breaks producing lines only containing sentence-ending punctuation
sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)

# No breaks inside parentheses/brackets
sentences = replace_til_no_change(
sentences,
r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
r"[\1 \2]",
)
sentences = replace_til_no_change(
sentences,
r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
r"(\1 \2)",
)
# Standard mismatched with possible intervening
sentences = replace_til_no_change(
sentences,
r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
r"[\1 \2]",
)
sentences = replace_til_no_change(
sentences,
r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
r"(\1 \2)",
)

# Line breaks within quotes
sentences = replace_til_no_change(
sentences,
r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
r'"\1 \2"',
)
sentences = replace_til_no_change(
sentences,
r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
r"'\1 \2'",
)

# Nesting to depth one
sentences = replace_til_no_change(
sentences,
r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
+ re.escape(separator)
+ r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
r"[\1 \2]",
)
sentences = replace_til_no_change(
sentences,
r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
+ re.escape(separator)
+ r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
r"(\1 \2)",
)

# No break after periods followed by a non-uppercase "normal word"
sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)

# No break after a single letter other than I
sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences)

# No break before coordinating conjunctions (CC)
coordinating_conjunctions = ["and", "or", "but", "nor", "yet"]
for cc in coordinating_conjunctions:
sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences)

# No break before prepositions (IN)
prepositions = [
"of",
"in",
"by",
"as",
"on",
"at",
"to",
"via",
"for",
"with",
"that",
"than",
"from",
"into",
"upon",
"after",
"while",
"during",
"within",
"through",
"between",
"whereas",
"whether",
]
for prep in prepositions:
sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences)

# No sentence breaks in the middle of specific abbreviations
sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences)
sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences)
sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences)

# No sentence break after specific abbreviations
abbreviations = [
r"e\. ?g\.",
r"i\. ?e\.",
r"i\. ?v\.",
r"vs\.",
r"cf\.",
r"Dr\.",
r"Mr\.",
r"Ms\.",
r"Mrs\.",
r"Prof\.",
r"Ph\.?D\.",
r"Jr\.",
r"St\.",
r"Mt\.",
r"etc\.",
r"Fig\.",
r"vol\.",
r"Vols\.",
r"no\.",
r"Nos\.",
r"et\.",
r"al\.",
r"i\. ?v\.",
r"inc\.",
r"Ltd\.",
r"Co\.",
r"Corp\.",
r"Dept\.",
r"est\.",
r"Asst\.",
r"approx\.",
r"dr\.",
r"fig\.",
r"mr\.",
r"mrs\.",
r"ms\.",
r"prof\.",
r"rep\.",
r"jr\.",
r"sen\.",
r"st\.",
r"vs\.",
r"i\. ?e\.",
]
for abbr in abbreviations:
sentences = re.sub(
rf"(\b{abbr}){separator}", r"\1", sentences, flags=re.IGNORECASE
)

return sentences


def split_sentences(text, separator="abcdsentenceseperatordcba"):
# Use the separator in the regex
text = re.sub(r"([?!.])(?=\s|$)", rf"\1{separator}", text)
text = postproc_splits(text, separator)
return re.split(rf"\n?{separator} ?\n?", text)
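
For reference, here is a minimal usage sketch of the new rule-based splitter. The input text and the expected output in the comments are illustrative assumptions, not taken from the PR's test suite.

```python
# Illustrative sketch (assumed example) of calling the new rule-based splitter.
from guardrails.utils.tokenization_utils import split_sentences

text = "Dr. Smith went to Washington. He arrived at 3 p.m.! Was he late?"
print(split_sentences(text))
# Roughly: ['Dr. Smith went to Washington.', 'He arrived at 3 p.m.!', 'Was he late?', '']
# The abbreviation heuristics keep "Dr." from ending a sentence, and a trailing
# empty element can appear when the text itself ends with sentence punctuation.
```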
55 changes: 39 additions & 16 deletions guardrails/validator_base.py
@@ -4,18 +4,19 @@
# - [ ] Remove validator_base.py in 0.6.x

import asyncio
import contextlib
from functools import partial
import inspect
import logging
from collections import defaultdict
from dataclasses import dataclass
import re
from string import Template
from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union
from typing_extensions import deprecated
from warnings import warn
import warnings

import nltk
import requests
from langchain_core.runnables import Runnable

@@ -31,12 +32,9 @@
from guardrails.types.on_fail import OnFailAction
from guardrails.utils.safe_get import safe_get
from guardrails.utils.hub_telemetry_utils import HubTelemetry

# See: https://github.com/guardrails-ai/guardrails/issues/829
try:
nltk.data.find("tokenizers/punkt")
except LookupError:
nltk.download("punkt")
from guardrails.utils.tokenization_utils import (
postproc_splits,
)


### functions to get chunks ###
@@ -48,21 +46,46 @@ def split_sentence_str(chunk: str):
return [fragments[0] + ".", ".".join(fragments[1:])]


def split_sentence_nltk(chunk: str):
def split_sentence_word_tokenizers_jl_separator(
chunk: str, separator: str = "abcdsentenceseperatordcba"
):
"""
NOTE: this approach currently does not work
Use a sentence tokenizer to split the chunk into sentences.
Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
We return the first sentence and the remaining chunks without the first sentence.

We perform the first step of WordTokenizers.jl's split_sentences function to
detect possible sentence boundaries before calling the sentence tokenizer.

Args:
chunk (str): The text to split into sentences.

Because using the tokenizer is expensive, we only use it if there
is a period present in the chunk.
Returns:
List[str]: A list of two strings. The first string is the first sentence
in the chunk. The second string is the remaining text in the chunk.
"""
# using the sentence tokenizer is expensive
# we check for a . to avoid wastefully calling the tokenizer
if "." not in chunk:

# check at least 3 characters have been accumulated before splitting
is_minimum_length = False
with contextlib.suppress(IndexError):
chunk[2]
is_minimum_length = True

# check for potential line endings, which is what split_sentences does
chunk_with_potential_line_endings, count = re.subn(
r"([?!.])(?=\s|$)", rf"\1{separator}", chunk
)
any_potential_line_endings = count > 0
if not is_minimum_length or not any_potential_line_endings:
return []
sentences = nltk.sent_tokenize(chunk)
if len(sentences) == 0:

sentences = postproc_splits(chunk_with_potential_line_endings, separator)
sentences = re.split(rf"\n?{separator} ?\n?", sentences)
# if not more than one sentence, we haven't accumulated enough for a validation
if len(sentences) <= 1:
return []

# return the sentence
# then the remaining chunks that aren't finished accumulating
return [sentences[0], "".join(sentences[1:])]
@@ -266,7 +289,7 @@ def _chunking_function(self, chunk: str) -> List[str]:
Returns:
list[str]: The text chunked into some subset.
"""
return split_sentence_str(chunk)
return split_sentence_word_tokenizers_jl_separator(chunk)

def validate_stream(
self, chunk: Any, metadata: Dict[str, Any], **kwargs
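
To make the streaming behavior concrete, here is a small sketch of the new chunking helper; the example inputs and outputs are assumptions for illustration, not taken from the repository's tests.

```python
# Assumed example: the replacement chunker returns [] until at least one full
# sentence has accumulated, then returns the first sentence plus the remainder.
from guardrails.validator_base import split_sentence_word_tokenizers_jl_separator

print(split_sentence_word_tokenizers_jl_separator("Hello wor"))
# -> []  (no sentence-ending punctuation yet; keep accumulating)

print(split_sentence_word_tokenizers_jl_separator("Hello world. This is a partial"))
# -> ['Hello world.', 'This is a partial']
```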
29 changes: 2 additions & 27 deletions poetry.lock

poetry.lock is a generated file; its diff is not rendered.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -28,7 +28,6 @@ rstr = "^3.2.2"
typing-extensions = "^4.8.0"
python-dateutil = "^2.8.2"
tiktoken = ">=0.5.1"
nltk = ">3.0, <=3.8.1"
litellm = "^1.37.14"
sqlvalidator = {version = "^0.0.20", optional = true}
sqlalchemy = {version = ">=2.0.9", optional = true}
9 changes: 8 additions & 1 deletion tests/integration_tests/test_assets/validators/detect_pii.py
@@ -1,6 +1,5 @@
from typing import Any, Callable, Dict, List, Union
import difflib
import nltk

from guardrails.validator_base import (
FailResult,
@@ -65,6 +64,14 @@ def chunking_function(self, chunk: str):
Because using the tokenizer is expensive, we only use it if
there is a period present in the chunk.
"""
try:
import nltk
except ImportError:
raise ImportError(
"nltk is required for sentence splitting. Please install it using "
"`poetry add nltk`"
)

# using the sentence tokenizer is expensive
# we check for a . to avoid wastefully calling the tokenizer
if "." not in chunk:
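
For completeness, a sketch of the deferred-import pattern the test asset now uses, written as a standalone function; the tokenization body after the period check is an assumed reconstruction based on the removed split_sentence_nltk helper, since the diff is truncated at that point.

```python
# Sketch of the deferred-import chunking pattern (assumed continuation of the
# truncated diff above; not necessarily the exact file contents).
def chunking_function(chunk: str):
    try:
        import nltk  # imported lazily so guardrails core no longer requires it
    except ImportError:
        raise ImportError(
            "nltk is required for sentence splitting. Please install it using "
            "`poetry add nltk`"
        )
    # using the sentence tokenizer is expensive
    # we check for a . to avoid wastefully calling the tokenizer
    if "." not in chunk:
        return []
    sentences = nltk.sent_tokenize(chunk)
    if len(sentences) == 0:
        return []
    # return the first sentence, then the text that is still accumulating
    return [sentences[0], "".join(sentences[1:])]
```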