
NLTK Removal #1102


Merged: 24 commits, Oct 21, 2024
24 commits (changes shown from 20 commits)
bbef78a
removed nltk dependency
AlejandroEsquivel Sep 27, 2024
0298730
remove nltk import and download from validator base
AlejandroEsquivel Sep 27, 2024
e699397
remove commented out test referencing nltk
AlejandroEsquivel Sep 27, 2024
c1f7350
throwing import error if nltk not available in detect_pii mock
AlejandroEsquivel Sep 27, 2024
416f290
Added rule based sentence tokenization from WordTokenizers.jl with mi…
AlejandroEsquivel Sep 27, 2024
6743df2
Added new version of split_sentence_str using new rule based sentence…
AlejandroEsquivel Sep 27, 2024
acae737
updates to factor in quotes during sentence splitting
AlejandroEsquivel Sep 27, 2024
815300f
updated poetry.lock
AlejandroEsquivel Sep 27, 2024
1a2546a
replaced split sentence default
AlejandroEsquivel Sep 27, 2024
bd0ab50
testing changes using custom separator in wordtokenizer algo
AlejandroEsquivel Oct 1, 2024
4ebef18
fix for counting subs
AlejandroEsquivel Oct 1, 2024
fb1d2d0
reverted split sentence in validators base
AlejandroEsquivel Oct 1, 2024
3fb5765
reverted to pre-seperator algo, added fix for conditional white space…
AlejandroEsquivel Oct 1, 2024
f453ecc
fix for optional white space after potential line endings ?!.
AlejandroEsquivel Oct 1, 2024
88477c7
Merge branch 'main' into feat/nltk-removal
AlejandroEsquivel Oct 1, 2024
6462558
added back modified seperator algo, fix for split sentence
AlejandroEsquivel Oct 1, 2024
6f8b175
Fix regex patterns for abbreviations in tokenization_utils_seperator.py
AlejandroEsquivel Oct 1, 2024
27a0419
fix tests
AlejandroEsquivel Oct 1, 2024
49aa02d
remove nltk references from tests
AlejandroEsquivel Oct 1, 2024
affb205
removed older scripts
AlejandroEsquivel Oct 1, 2024
d3cfed9
bumped api version
AlejandroEsquivel Oct 18, 2024
97f3457
dep updates
zsimjee Oct 18, 2024
5562688
Merge branch '0.6.0-dev' into feat/nltk-removal
zsimjee Oct 19, 2024
7d7240b
fix test out of main
dtam Oct 21, 2024
206 changes: 206 additions & 0 deletions guardrails/utils/tokenization_utils.py
@@ -0,0 +1,206 @@
# This file contains code adapted from the WordTokenizers.jl
# https://github.com/JuliaText/WordTokenizers.jl project.
# It is subject to the license terms in the Apache License file
# found in the top-level directory of this distribution.
# This file has been modified by Guardrails AI on September 27 2024.

import re


def replace_til_no_change(input_text, pattern, replacement):
while True:
new_text = re.sub(pattern, replacement, input_text)
if new_text == input_text:
break
input_text = new_text
return input_text


def postproc_splits(sentences, separator):
"""
Applies heuristic rules to repair sentence splitting errors.
Developed for use as postprocessing for the GENIA sentence
splitter on PubMed abstracts, with minor tweaks for
full-text documents.

`sentences` should be a string, with line breaks on sentence boundaries.
Returns a similar string, but more correct.

Based on
https://github.com/ninjin/geniass/blob/master/geniass-postproc.pl
Which is
(c) 2010 Sampo Pyysalo. No rights reserved, i.e. do whatever you like with this.
Which draws in part on heuristics included in Yoshimasa Tsuruoka's
medss.pl script.
"""

# Remove Windows line endings
sentences = sentences.replace("\r", "")

# Breaks sometimes missing after "?", "safe" cases
sentences = re.sub(
r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
)
# Breaks sometimes missing after ".", "safe" cases
sentences = re.sub(
r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
)

# No breaks producing lines only containing sentence-ending punctuation
sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)

# No breaks inside parentheses/brackets
sentences = replace_til_no_change(
sentences,
r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
r"[\1 \2]",
)
sentences = replace_til_no_change(
sentences,
r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
r"(\1 \2)",
)
# Standard mismatched with possible intervening
sentences = replace_til_no_change(
sentences,
r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
r"[\1 \2]",
)
sentences = replace_til_no_change(
sentences,
r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
r"(\1 \2)",
)

# Line breaks within quotes
sentences = replace_til_no_change(
sentences,
r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
r'"\1 \2"',
)
sentences = replace_til_no_change(
sentences,
r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
r"'\1 \2'",
)

# Nesting to depth one
sentences = replace_til_no_change(
sentences,
r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
+ re.escape(separator)
+ r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
r"[\1 \2]",
)
sentences = replace_til_no_change(
sentences,
r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
+ re.escape(separator)
+ r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
r"(\1 \2)",
)

# No break after periods followed by a non-uppercase "normal word"
sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)

# No break after a single letter other than I
sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences)

# No break before coordinating conjunctions (CC)
coordinating_conjunctions = ["and", "or", "but", "nor", "yet"]
for cc in coordinating_conjunctions:
sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences)

# No break before prepositions (IN)
prepositions = [
"of",
"in",
"by",
"as",
"on",
"at",
"to",
"via",
"for",
"with",
"that",
"than",
"from",
"into",
"upon",
"after",
"while",
"during",
"within",
"through",
"between",
"whereas",
"whether",
]
for prep in prepositions:
sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences)

# No sentence breaks in the middle of specific abbreviations
sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences)
sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences)
sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences)

# No sentence break after specific abbreviations
abbreviations = [
r"e\. ?g\.",
r"i\. ?e\.",
r"i\. ?v\.",
r"vs\.",
r"cf\.",
r"Dr\.",
r"Mr\.",
r"Ms\.",
r"Mrs\.",
r"Prof\.",
r"Ph\.?D\.",
r"Jr\.",
r"St\.",
r"Mt\.",
r"etc\.",
r"Fig\.",
r"vol\.",
r"Vols\.",
r"no\.",
r"Nos\.",
r"et\.",
r"al\.",
r"i\. ?v\.",
r"inc\.",
r"Ltd\.",
r"Co\.",
r"Corp\.",
r"Dept\.",
r"est\.",
r"Asst\.",
r"approx\.",
r"dr\.",
r"fig\.",
r"mr\.",
r"mrs\.",
r"ms\.",
r"prof\.",
r"rep\.",
r"jr\.",
r"sen\.",
r"st\.",
r"vs\.",
r"i\. ?e\.",
]
for abbr in abbreviations:
sentences = re.sub(
rf"(\b{abbr}){separator}", r"\1", sentences, flags=re.IGNORECASE
)

return sentences


def split_sentences(text, separator="abcdsentenceseperatordcba"):
# Use the separator in the regex
text = re.sub(r"([?!.])(?=\s|$)", rf"\1{separator}", text)
text = postproc_splits(text, separator)
return re.split(rf"\n?{separator} ?\n?", text)
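
For reference, here is a minimal usage sketch of the new rule-based splitter. The input text and the expected output in the comments are illustrative assumptions, not taken from the PR's test suite.

```python
# Illustrative sketch (assumed example) of calling the new rule-based splitter.
from guardrails.utils.tokenization_utils import split_sentences

text = "Dr. Smith went to Washington. He arrived at 3 p.m.! Was he late?"
print(split_sentences(text))
# Roughly: ['Dr. Smith went to Washington.', 'He arrived at 3 p.m.!', 'Was he late?', '']
# The abbreviation heuristics keep "Dr." from ending a sentence, and a trailing
# empty element can appear when the text itself ends with sentence punctuation.
```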
55 changes: 39 additions & 16 deletions guardrails/validator_base.py
@@ -4,18 +4,19 @@
# - [ ] Remove validator_base.py in 0.6.x

import asyncio
import contextlib
from functools import partial
import inspect
import logging
from collections import defaultdict
from dataclasses import dataclass
import re
from string import Template
from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union
from typing_extensions import deprecated
from warnings import warn
import warnings

import nltk
import requests
from langchain_core.runnables import Runnable

@@ -31,12 +32,9 @@
from guardrails.types.on_fail import OnFailAction
from guardrails.utils.safe_get import safe_get
from guardrails.utils.hub_telemetry_utils import HubTelemetry

# See: https://github.com/guardrails-ai/guardrails/issues/829
try:
nltk.data.find("tokenizers/punkt")
except LookupError:
nltk.download("punkt")
from guardrails.utils.tokenization_utils import (
postproc_splits,
)


### functions to get chunks ###
@@ -48,21 +46,46 @@ def split_sentence_str(chunk: str):
return [fragments[0] + ".", ".".join(fragments[1:])]


def split_sentence_nltk(chunk: str):
def split_sentence_word_tokenizers_jl_separator(
chunk: str, separator: str = "abcdsentenceseperatordcba"
):
"""
NOTE: this approach currently does not work
Use a sentence tokenizer to split the chunk into sentences.
Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
We return the first sentence and the remaining chunks without the first sentence.

We perform the first step of WordTokenizers.jl's split_sentences function to
detect possible sentence boundaries before calling the sentence tokenizer.

Args:
chunk (str): The text to split into sentences.

Because using the tokenizer is expensive, we only use it if there
is a period present in the chunk.
Returns:
List[str]: A list of two strings. The first string is the first sentence
in the chunk. The second string is the remaining text in the chunk.
"""
# using the sentence tokenizer is expensive
# we check for a . to avoid wastefully calling the tokenizer
if "." not in chunk:

# check at least 3 characters have been accumulated before splitting
is_minimum_length = False
with contextlib.suppress(IndexError):
chunk[2]
is_minimum_length = True

# check for potential line endings, which is what split_sentences does
chunk_with_potential_line_endings, count = re.subn(
r"([?!.])(?=\s|$)", rf"\1{separator}", chunk
)
any_potential_line_endings = count > 0
if not is_minimum_length or not any_potential_line_endings:
return []
sentences = nltk.sent_tokenize(chunk)
if len(sentences) == 0:

sentences = postproc_splits(chunk_with_potential_line_endings, separator)
sentences = re.split(rf"\n?{separator} ?\n?", sentences)
# if not more than one sentence, we haven't accumulated enough for a validation
if len(sentences) <= 1:
return []

# return the sentence
# then the remaining chunks that aren't finished accumulating
return [sentences[0], "".join(sentences[1:])]
@@ -266,7 +289,7 @@ def _chunking_function(self, chunk: str) -> List[str]:
Returns:
list[str]: The text chunked into some subset.
"""
return split_sentence_str(chunk)
return split_sentence_word_tokenizers_jl_separator(chunk)

def validate_stream(
self, chunk: Any, metadata: Dict[str, Any], **kwargs
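
To make the streaming behavior concrete, here is a small sketch of the new chunking helper; the example inputs and outputs are assumptions for illustration, not taken from the repository's tests.

```python
# Assumed example: the replacement chunker returns [] until at least one full
# sentence has accumulated, then returns the first sentence plus the remainder.
from guardrails.validator_base import split_sentence_word_tokenizers_jl_separator

print(split_sentence_word_tokenizers_jl_separator("Hello wor"))
# -> []  (no sentence-ending punctuation yet; keep accumulating)

print(split_sentence_word_tokenizers_jl_separator("Hello world. This is a partial"))
# -> ['Hello world.', 'This is a partial']
```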
29 changes: 2 additions & 27 deletions poetry.lock

poetry.lock is a generated file; its diff is not rendered.

1 change: 0 additions & 1 deletion pyproject.toml
@@ -28,7 +28,6 @@ rstr = "^3.2.2"
typing-extensions = "^4.8.0"
python-dateutil = "^2.8.2"
tiktoken = ">=0.5.1"
nltk = ">3.0, <=3.8.1"
litellm = "^1.37.14"
sqlvalidator = {version = "^0.0.20", optional = true}
sqlalchemy = {version = ">=2.0.9", optional = true}
9 changes: 8 additions & 1 deletion tests/integration_tests/test_assets/validators/detect_pii.py
@@ -1,6 +1,5 @@
from typing import Any, Callable, Dict, List, Union
import difflib
import nltk

from guardrails.validator_base import (
FailResult,
@@ -65,6 +64,14 @@ def chunking_function(self, chunk: str):
Because using the tokenizer is expensive, we only use it if
there is a period present in the chunk.
"""
try:
import nltk
except ImportError:
raise ImportError(
"nltk is required for sentence splitting. Please install it using "
"`poetry add nltk`"
)

# using the sentence tokenizer is expensive
# we check for a . to avoid wastefully calling the tokenizer
if "." not in chunk:
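
For completeness, a sketch of the deferred-import pattern the test asset now uses, written as a standalone function; the tokenization body after the period check is an assumed reconstruction based on the removed split_sentence_nltk helper, since the diff is truncated at that point.

```python
# Sketch of the deferred-import chunking pattern (assumed continuation of the
# truncated diff above; not necessarily the exact file contents).
def chunking_function(chunk: str):
    try:
        import nltk  # imported lazily so guardrails core no longer requires it
    except ImportError:
        raise ImportError(
            "nltk is required for sentence splitting. Please install it using "
            "`poetry add nltk`"
        )
    # using the sentence tokenizer is expensive
    # we check for a . to avoid wastefully calling the tokenizer
    if "." not in chunk:
        return []
    sentences = nltk.sent_tokenize(chunk)
    if len(sentences) == 0:
        return []
    # return the first sentence, then the text that is still accumulating
    return [sentences[0], "".join(sentences[1:])]
```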