diff --git a/guardrails/utils/tokenization_utils.py b/guardrails/utils/tokenization_utils.py
new file mode 100644
index 000000000..6dab87cdd
--- /dev/null
+++ b/guardrails/utils/tokenization_utils.py
@@ -0,0 +1,206 @@
+# This file contains code adapted from the WordTokenizers.jl
+# https://github.com/JuliaText/WordTokenizers.jl project.
+# It is subject to the license terms in the Apache License file
+# found in the top-level directory of this distribution.
+# This file has been modified by Guardrails AI on September 27 2024.
+
+import re
+
+
+def replace_til_no_change(input_text, pattern, replacement):
+    while True:
+        new_text = re.sub(pattern, replacement, input_text)
+        if new_text == input_text:
+            break
+        input_text = new_text
+    return input_text
+
+
+def postproc_splits(sentences, separator):
+    """
+    Applies heuristic rules to repair sentence splitting errors.
+    Developed for use as postprocessing for the GENIA sentence
+    splitter on PubMed abstracts, with minor tweaks for
+    full-text documents.
+
+    `sentences` should be a string, with line breaks on sentence boundaries.
+    Returns a similar string, but more correct.
+
+    Based on
+    https://github.com/ninjin/geniass/blob/master/geniass-postproc.pl
+    Which is
+    (c) 2010 Sampo Pyysalo. No rights reserved, i.e. do whatever you like with this.
+    Which draws in part on heuristics included in Yoshimasa Tsuruoka's
+    medss.pl script.
+    """
+
+    # Remove Windows line endings
+    sentences = sentences.replace("\r", "")
+
+    # Breaks sometimes missing after "?", "safe" cases
+    sentences = re.sub(
+        r"\b([a-z]+\?)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
+    )
+    # Breaks sometimes missing after ".", "safe" cases
+    sentences = re.sub(
+        r"\b([a-z]+ \.)\s+([A-Z][a-z]+)\b", rf"\1{separator}\2", sentences
+    )
+
+    # No breaks producing lines only containing sentence-ending punctuation
+    sentences = re.sub(rf"{separator}([.!?]+){separator}", r"\1" + separator, sentences)
+
+    # No breaks inside parentheses/brackets
+    sentences = replace_til_no_change(
+        sentences,
+        r"\[([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\]",
+        r"[\1 \2]",
+    )
+    sentences = replace_til_no_change(
+        sentences,
+        r"\(([^\[\]\(\)]*)" + re.escape(separator) + r"([^\[\]\(\)]*)\)",
+        r"(\1 \2)",
+    )
+    # Standard mismatched with possible intervening
+    sentences = replace_til_no_change(
+        sentences,
+        r"\[([^\[\]]{0,250})" + re.escape(separator) + r"([^\[\]]{0,250})\]",
+        r"[\1 \2]",
+    )
+    sentences = replace_til_no_change(
+        sentences,
+        r"\(([^\(\)]{0,250})" + re.escape(separator) + r"([^\(\)]{0,250})\)",
+        r"(\1 \2)",
+    )
+
+    # Line breaks within quotes
+    sentences = replace_til_no_change(
+        sentences,
+        r'"([^"\n]{0,250})' + re.escape(separator) + r'([^"\n]{0,250})"',
+        r'"\1 \2"',
+    )
+    sentences = replace_til_no_change(
+        sentences,
+        r"'([^'\n]{0,250})" + re.escape(separator) + r"([^'\n]{0,250})'",
+        r"'\1 \2'",
+    )
+
+    # Nesting to depth one
+    sentences = replace_til_no_change(
+        sentences,
+        r"\[((?:[^\[\]]|\[[^\[\]]*\]){0,250})"
+        + re.escape(separator)
+        + r"((?:[^\[\]]|\[[^\[\]]*\]){0,250})\]",
+        r"[\1 \2]",
+    )
+    sentences = replace_til_no_change(
+        sentences,
+        r"\(((?:[^\(\)]|\([^\(\)]*\)){0,250})"
+        + re.escape(separator)
+        + r"((?:[^\(\)]|\([^\(\)]*\)){0,250})\)",
+        r"(\1 \2)",
+    )
+
+    # No break after periods followed by a non-uppercase "normal word"
+    sentences = re.sub(rf"\.{separator}([a-z]{{3,}}[a-z-]*[ .:,])", r". \1", sentences)
\1", sentences) + + # No break after a single letter other than I + sentences = re.sub(rf"(\b[A-HJ-Z]\.){separator}", r"\1 ", sentences) + + # No break before coordinating conjunctions (CC) + coordinating_conjunctions = ["and", "or", "but", "nor", "yet"] + for cc in coordinating_conjunctions: + sentences = re.sub(rf"{separator}({cc}\s)", r" \1", sentences) + + # No break before prepositions (IN) + prepositions = [ + "of", + "in", + "by", + "as", + "on", + "at", + "to", + "via", + "for", + "with", + "that", + "than", + "from", + "into", + "upon", + "after", + "while", + "during", + "within", + "through", + "between", + "whereas", + "whether", + ] + for prep in prepositions: + sentences = re.sub(rf"{separator}({prep}\s)", r" \1", sentences) + + # No sentence breaks in the middle of specific abbreviations + sentences = re.sub(rf"(\be\.){separator}(g\.)", r"\1 \2", sentences) + sentences = re.sub(rf"(\bi\.){separator}(e\.)", r"\1 \2", sentences) + sentences = re.sub(rf"(\bi\.){separator}(v\.)", r"\1 \2", sentences) + + # No sentence break after specific abbreviations + abbreviations = [ + r"e\. ?g\.", + r"i\. ?e\.", + r"i\. ?v\.", + r"vs\.", + r"cf\.", + r"Dr\.", + r"Mr\.", + r"Ms\.", + r"Mrs\.", + r"Prof\.", + r"Ph\.?D\.", + r"Jr\.", + r"St\.", + r"Mt\.", + r"etc\.", + r"Fig\.", + r"vol\.", + r"Vols\.", + r"no\.", + r"Nos\.", + r"et\.", + r"al\.", + r"i\. ?v\.", + r"inc\.", + r"Ltd\.", + r"Co\.", + r"Corp\.", + r"Dept\.", + r"est\.", + r"Asst\.", + r"approx\.", + r"dr\.", + r"fig\.", + r"mr\.", + r"mrs\.", + r"ms\.", + r"prof\.", + r"rep\.", + r"jr\.", + r"sen\.", + r"st\.", + r"vs\.", + r"i\. ?e\.", + ] + for abbr in abbreviations: + sentences = re.sub( + rf"(\b{abbr}){separator}", r"\1", sentences, flags=re.IGNORECASE + ) + + return sentences + + +def split_sentences(text, separator="abcdsentenceseperatordcba"): + # Use the separator in the regex + text = re.sub(r"([?!.])(?=\s|$)", rf"\1{separator}", text) + text = postproc_splits(text, separator) + return re.split(rf"\n?{separator} ?\n?", text) diff --git a/guardrails/validator_base.py b/guardrails/validator_base.py index d2c8e53db..ba86d2989 100644 --- a/guardrails/validator_base.py +++ b/guardrails/validator_base.py @@ -4,18 +4,19 @@ # - [ ] Remove validator_base.py in 0.6.x import asyncio +import contextlib from functools import partial import inspect import logging from collections import defaultdict from dataclasses import dataclass +import re from string import Template from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union from typing_extensions import deprecated from warnings import warn import warnings -import nltk import requests from langchain_core.runnables import Runnable @@ -31,12 +32,9 @@ from guardrails.types.on_fail import OnFailAction from guardrails.utils.safe_get import safe_get from guardrails.utils.hub_telemetry_utils import HubTelemetry - -# See: https://github.com/guardrails-ai/guardrails/issues/829 -try: - nltk.data.find("tokenizers/punkt") -except LookupError: - nltk.download("punkt") +from guardrails.utils.tokenization_utils import ( + postproc_splits, +) ### functions to get chunks ### @@ -48,21 +46,46 @@ def split_sentence_str(chunk: str): return [fragments[0] + ".", ".".join(fragments[1:])] -def split_sentence_nltk(chunk: str): +def split_sentence_word_tokenizers_jl_separator( + chunk: str, separator: str = "abcdsentenceseperatordcba" +): """ - NOTE: this approach currently does not work - Use a sentence tokenizer to split the chunk into sentences. 
diff --git a/guardrails/validator_base.py b/guardrails/validator_base.py
index d2c8e53db..ba86d2989 100644
--- a/guardrails/validator_base.py
+++ b/guardrails/validator_base.py
@@ -4,18 +4,19 @@
 # - [ ] Remove validator_base.py in 0.6.x
 import asyncio
+import contextlib
 from functools import partial
 import inspect
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
+import re
 from string import Template
 from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union
 from typing_extensions import deprecated
 from warnings import warn
 import warnings
-import nltk
 import requests
 
 from langchain_core.runnables import Runnable
@@ -31,12 +32,9 @@
 from guardrails.types.on_fail import OnFailAction
 from guardrails.utils.safe_get import safe_get
 from guardrails.utils.hub_telemetry_utils import HubTelemetry
-
-# See: https://github.com/guardrails-ai/guardrails/issues/829
-try:
-    nltk.data.find("tokenizers/punkt")
-except LookupError:
-    nltk.download("punkt")
+from guardrails.utils.tokenization_utils import (
+    postproc_splits,
+)
 
 
 ### functions to get chunks ###
@@ -48,21 +46,46 @@ def split_sentence_str(chunk: str):
     return [fragments[0] + ".", ".".join(fragments[1:])]
 
 
-def split_sentence_nltk(chunk: str):
+def split_sentence_word_tokenizers_jl_separator(
+    chunk: str, separator: str = "abcdsentenceseperatordcba"
+):
     """
-    NOTE: this approach currently does not work
-    Use a sentence tokenizer to split the chunk into sentences.
+    Use a sentence tokenizer to detect if at least one sentence is present in the chunk.
+    We return the first sentence and the remaining chunks without the first sentence.
+
+    We perform the first step of WordTokenizers.jl's split_sentences function to
+    detect possible sentence boundaries before calling the sentence tokenizer.
+
+    Args:
+        chunk (str): The text to split into sentences.
 
-    Because using the tokenizer is expensive, we only use it if there
-    is a period present in the chunk.
+    Returns:
+        List[str]: A list of two strings. The first string is the first sentence
+        in the chunk. The second string is the remaining text in the chunk.
     """
     # using the sentence tokenizer is expensive
    # we check for a . to avoid wastefully calling the tokenizer
-    if "." not in chunk:
+
+    # check at least 3 characters have been accumulated before splitting
+    is_minimum_length = False
+    with contextlib.suppress(IndexError):
+        chunk[2]
+        is_minimum_length = True
+
+    # check for potential line endings, which is what split_sentences does
+    chunk_with_potential_line_endings, count = re.subn(
+        r"([?!.])(?=\s|$)", rf"\1{separator}", chunk
+    )
+    any_potential_line_endings = count > 0
+    if not is_minimum_length or not any_potential_line_endings:
         return []
-    sentences = nltk.sent_tokenize(chunk)
-    if len(sentences) == 0:
+
+    sentences = postproc_splits(chunk_with_potential_line_endings, separator)
+    sentences = re.split(rf"\n?{separator} ?\n?", sentences)
+    # if not more than one sentence, we haven't accumulated enough for a validation
+    if len(sentences) <= 1:
         return []
+
+    # return the sentence
     # then the remaining chunks that aren't finished accumulating
     return [sentences[0], "".join(sentences[1:])]
 
@@ -266,7 +289,7 @@ def _chunking_function(self, chunk: str) -> List[str]:
         Returns:
             list[str]: The text chunked into some subset.
         """
-        return split_sentence_str(chunk)
+        return split_sentence_word_tokenizers_jl_separator(chunk)
 
     def validate_stream(
         self, chunk: Any, metadata: Dict[str, Any], **kwargs
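Reviewer note (illustrative only, not part of the patch): how the new chunker is expected to behave as a stream accumulates. The sample strings below are my own; an empty list means "keep accumulating", otherwise the first element is a complete sentence and the second is the unfinished remainder.

    from guardrails.validator_base import split_sentence_word_tokenizers_jl_separator

    # No sentence-ending punctuation yet, so keep accumulating.
    print(split_sentence_word_tokenizers_jl_separator("Hello wor"))
    # []

    # One finished sentence plus the start of the next one.
    print(split_sentence_word_tokenizers_jl_separator("Hello world. How are"))
    # Expected (roughly): ['Hello world.', 'How are']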
diff --git a/poetry.lock b/poetry.lock
index f2dabcbfb..eafa419f2 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,5 +1,21 @@
 # This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
 
+[[package]]
+name = "aiocache"
+version = "0.12.3"
+description = "multi backend asyncio cache"
+optional = false
+python-versions = "*"
+files = [
+    {file = "aiocache-0.12.3-py2.py3-none-any.whl", hash = "sha256:889086fc24710f431937b87ad3720a289f7fc31c4fd8b68e9f918b9bacd8270d"},
+    {file = "aiocache-0.12.3.tar.gz", hash = "sha256:f528b27bf4d436b497a1d0d1a8f59a542c153ab1e37c3621713cb376d44c4713"},
+]
+
+[package.extras]
+memcached = ["aiomcache (>=0.5.2)"]
+msgpack = ["msgpack (>=0.5.5)"]
+redis = ["redis (>=4.2.0)"]
+
 [[package]]
 name = "aiohappyeyeballs"
 version = "2.4.3"
@@ -307,6 +323,23 @@ types-python-dateutil = ">=2.8.10"
 doc = ["doc8", "sphinx (>=7.0.0)", "sphinx-autobuild", "sphinx-autodoc-typehints", "sphinx_rtd_theme (>=1.3.0)"]
 test = ["dateparser (==1.*)", "pre-commit", "pytest", "pytest-cov", "pytest-mock", "pytz (==2021.1)", "simplejson (==3.*)"]
 
+[[package]]
+name = "asgiref"
+version = "3.8.1"
+description = "ASGI specs, helper code, and adapters"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "asgiref-3.8.1-py3-none-any.whl", hash = "sha256:3e1e3ecc849832fe52ccf2cb6686b7a55f82bb1d6aee72a58826471390335e47"},
+    {file = "asgiref-3.8.1.tar.gz", hash = "sha256:c343bd80a0bec947a9860adb4c432ffa7db769836c64238fc34bdc3fec84d590"},
+]
+
+[package.dependencies]
+typing-extensions = {version = ">=4", markers = "python_version < \"3.11\""}
+
+[package.extras]
+tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"]
+
 [[package]]
 name = "asttokens"
 version = "2.4.1"
@@ -489,7 +522,7 @@ css = ["tinycss2 (>=1.1.0,<1.3)"]
 name = "blinker"
 version = "1.8.2"
 description = "Fast, simple object-to-object and broadcast signaling"
-optional = false
+optional = true
 python-versions = ">=3.8"
 files = [
     {file = "blinker-1.8.2-py3-none-any.whl", hash = "sha256:1779309f71bf239144b9399d06ae925637cf6634cf6bd131104184531bf67c01"},
@@ -537,17 +570,6 @@ urllib3 = [
 [package.extras]
 crt = ["awscrt (==0.22.0)"]
 
-[[package]]
-name = "cachelib"
-version = "0.9.0"
-description = "A collection of cache libraries in the same API interface."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "cachelib-0.9.0-py3-none-any.whl", hash = "sha256:811ceeb1209d2fe51cd2b62810bd1eccf70feba5c52641532498be5c675493b3"},
-    {file = "cachelib-0.9.0.tar.gz", hash = "sha256:38222cc7c1b79a23606de5c2607f4925779e37cdcea1c2ad21b8bae94b5425a5"},
-]
-
 [[package]]
 name = "cachetools"
 version = "5.5.0"
@@ -1489,6 +1511,26 @@ files = [
 
 [package.dependencies]
 python-dateutil = ">=2.4"
 
+[[package]]
+name = "fastapi"
+version = "0.115.2"
+description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "fastapi-0.115.2-py3-none-any.whl", hash = "sha256:61704c71286579cc5a598763905928f24ee98bfcc07aabe84cfefb98812bbc86"},
+    {file = "fastapi-0.115.2.tar.gz", hash = "sha256:3995739e0b09fa12f984bce8fa9ae197b35d433750d3d312422d846e283697ee"},
+]
+
+[package.dependencies]
+pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0"
+starlette = ">=0.37.2,<0.41.0"
+typing-extensions = ">=4.8.0"
+
+[package.extras]
+all = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.7)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"]
+standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "jinja2 (>=2.11.2)", "python-multipart (>=0.0.7)", "uvicorn[standard] (>=0.12.0)"]
+
 [[package]]
 name = "fastcore"
 version = "1.4.2"
@@ -1559,7 +1601,7 @@ typing = ["typing-extensions (>=4.12.2)"]
 name = "flask"
 version = "3.0.3"
 description = "A simple framework for building complex web applications."
-optional = false
+optional = true
 python-versions = ">=3.8"
 files = [
     {file = "flask-3.0.3-py3-none-any.whl", hash = "sha256:34e815dfaa43340d1d15a5c3a02b8476004037eb4840b34910c6e21679d288f3"},
@@ -1578,50 +1620,6 @@ Werkzeug = ">=3.0.0"
 async = ["asgiref (>=3.2)"]
 dotenv = ["python-dotenv"]
 
-[[package]]
-name = "flask-caching"
-version = "2.3.0"
-description = "Adds caching support to Flask applications."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "Flask_Caching-2.3.0-py3-none-any.whl", hash = "sha256:51771c75682e5abc1483b78b96d9131d7941dc669b073852edfa319dd4e29b6e"},
-    {file = "flask_caching-2.3.0.tar.gz", hash = "sha256:d7e4ca64a33b49feb339fcdd17e6ba25f5e01168cf885e53790e885f83a4d2cf"},
-]
-
-[package.dependencies]
-cachelib = ">=0.9.0,<0.10.0"
-Flask = "*"
-
-[[package]]
-name = "flask-cors"
-version = "5.0.0"
-description = "A Flask extension adding a decorator for CORS support"
-optional = false
-python-versions = "*"
-files = [
-    {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"},
-    {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"},
-]
-
-[package.dependencies]
-Flask = ">=0.9"
-
-[[package]]
-name = "flask-sqlalchemy"
-version = "3.1.1"
-description = "Add SQLAlchemy support to your Flask application."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "flask_sqlalchemy-3.1.1-py3-none-any.whl", hash = "sha256:4ba4be7f419dc72f4efd8802d69974803c37259dd42f3913b0dcf75c9447e0a0"},
-    {file = "flask_sqlalchemy-3.1.1.tar.gz", hash = "sha256:e4b68bb881802dda1a7d878b2fc84c06d1ee57fb40b874d3dc97dabfa36b8312"},
-]
-
-[package.dependencies]
-flask = ">=2.2.5"
-sqlalchemy = ">=2.0.16"
-
 [[package]]
 name = "fonttools"
 version = "4.54.1"
@@ -2154,34 +2152,33 @@ protobuf = ["grpcio-tools (>=1.67.0)"]
 
 [[package]]
 name = "guardrails-api"
-version = "0.0.3"
+version = "0.1.0a1"
 description = "Guardrails API"
 optional = false
 python-versions = "<4,>=3.8"
 files = [
-    {file = "guardrails_api-0.0.3-py3-none-any.whl", hash = "sha256:22f2ae1ea34fb36f9407078cd81924f756968477b4f079f0d82b329b1441552f"},
-    {file = "guardrails_api-0.0.3.tar.gz", hash = "sha256:f4e9b192ee3f2bfcba1453f4cca9d68aca5ccfe22adc82f104c449b283136b5d"},
+    {file = "guardrails_api-0.1.0a1-py3-none-any.whl", hash = "sha256:4685447b2ffbddce77de14d165e28a12437154fabc203678bc33edb2670d647c"},
+    {file = "guardrails_api-0.1.0a1.tar.gz", hash = "sha256:0ea1e0b8ac7c240bb58f4649dd547a7772c94a245ff9701f9b688872d0bc3563"},
 ]
 
 [package.dependencies]
+aiocache = ">=0.11.1"
 boto3 = ">=1.34.115,<2"
-flask = ">=3.0.3,<4"
-Flask-Caching = ">=2.3.0,<3"
-Flask-Cors = ">=4.0.1,<6"
-Flask-SQLAlchemy = ">=3.1.1,<4"
-guardrails-ai = ">=0.5.6"
+fastapi = ">=0.114.1"
+guardrails-ai = ">=0.5.12"
 jsonschema = ">=4.22.0,<5"
 litellm = ">=1.39.3,<2"
 opentelemetry-api = ">=1.0.0,<2"
 opentelemetry-exporter-otlp-proto-grpc = ">=1.0.0,<2"
 opentelemetry-exporter-otlp-proto-http = ">=1.0.0,<2"
-opentelemetry-instrumentation-flask = ">=0.12b0,<1"
+opentelemetry-instrumentation-fastapi = ">=0.48b0"
 opentelemetry-sdk = ">=1.0.0,<2"
 psycopg2-binary = ">=2.9.9,<3"
 referencing = ">=0.35.1,<1"
 requests = ">=2.32.3"
+SQLAlchemy = ">=2.0.34"
 typer = ">=0.9.4,<1"
-Werkzeug = ">=3.0.3,<4"
+uvicorn = ">=0.30.6"
 
 [package.extras]
 dev = ["coverage", "gunicorn (>=22.0.0,<23)", "pytest", "pytest-mock", "ruff"]
@@ -2547,7 +2544,7 @@ arrow = ">=0.15.0"
 name = "itsdangerous"
 version = "2.2.0"
 description = "Safely pass data to untrusted environments and back."
-optional = false
+optional = true
 python-versions = ">=3.8"
 files = [
     {file = "itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef"},
@@ -2760,7 +2757,7 @@ files = [
 name = "joblib"
 version = "1.4.2"
 description = "Lightweight pipelining with Python functions"
-optional = false
+optional = true
 python-versions = ">=3.8"
 files = [
     {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"},
@@ -4403,31 +4400,6 @@ files = [
     {file = "nh3-0.2.18.tar.gz", hash = "sha256:94a166927e53972a9698af9542ace4e38b9de50c34352b962f4d9a7d4c927af4"},
 ]
 
-[[package]]
-name = "nltk"
-version = "3.8.1"
-description = "Natural Language Toolkit"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"},
-    {file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"},
-]
-
-[package.dependencies]
-click = "*"
-joblib = "*"
-regex = ">=2021.8.3"
-tqdm = "*"
-
-[package.extras]
-all = ["matplotlib", "numpy", "pyparsing", "python-crfsuite", "requests", "scikit-learn", "scipy", "twython"]
-corenlp = ["requests"]
-machine-learning = ["numpy", "python-crfsuite", "scikit-learn", "scipy"]
-plot = ["matplotlib"]
-tgrep = ["pyparsing"]
-twitter = ["twython"]
-
 [[package]]
 name = "nodeenv"
 version = "1.9.1"
@@ -4844,45 +4816,47 @@ setuptools = ">=16.0"
 wrapt = ">=1.0.0,<2.0.0"
 
 [[package]]
-name = "opentelemetry-instrumentation-flask"
+name = "opentelemetry-instrumentation-asgi"
 version = "0.48b0"
-description = "Flask instrumentation for OpenTelemetry"
+description = "ASGI instrumentation for OpenTelemetry"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "opentelemetry_instrumentation_flask-0.48b0-py3-none-any.whl", hash = "sha256:26b045420b9d76e85493b1c23fcf27517972423480dc6cf78fd6924248ba5808"},
-    {file = "opentelemetry_instrumentation_flask-0.48b0.tar.gz", hash = "sha256:e03a34428071aebf4864ea6c6a564acef64f88c13eb3818e64ea90da61266c3d"},
+    {file = "opentelemetry_instrumentation_asgi-0.48b0-py3-none-any.whl", hash = "sha256:ddb1b5fc800ae66e85a4e2eca4d9ecd66367a8c7b556169d9e7b57e10676e44d"},
+    {file = "opentelemetry_instrumentation_asgi-0.48b0.tar.gz", hash = "sha256:04c32174b23c7fa72ddfe192dad874954968a6a924608079af9952964ecdf785"},
 ]
 
 [package.dependencies]
-importlib-metadata = ">=4.0"
+asgiref = ">=3.0,<4.0"
 opentelemetry-api = ">=1.12,<2.0"
 opentelemetry-instrumentation = "0.48b0"
-opentelemetry-instrumentation-wsgi = "0.48b0"
 opentelemetry-semantic-conventions = "0.48b0"
 opentelemetry-util-http = "0.48b0"
-packaging = ">=21.0"
 
 [package.extras]
-instruments = ["flask (>=1.0)"]
+instruments = ["asgiref (>=3.0,<4.0)"]
 
 [[package]]
-name = "opentelemetry-instrumentation-wsgi"
+name = "opentelemetry-instrumentation-fastapi"
 version = "0.48b0"
-description = "WSGI Middleware for OpenTelemetry"
+description = "OpenTelemetry FastAPI Instrumentation"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "opentelemetry_instrumentation_wsgi-0.48b0-py3-none-any.whl", hash = "sha256:c6051124d741972090fe94b2fa302555e1e2a22e9cdda32dd39ed49a5b34e0c6"},
-    {file = "opentelemetry_instrumentation_wsgi-0.48b0.tar.gz", hash = "sha256:1a1e752367b0df4397e0b835839225ef5c2c3c053743a261551af13434fc4d51"},
+    {file = "opentelemetry_instrumentation_fastapi-0.48b0-py3-none-any.whl", hash = "sha256:afeb820a59e139d3e5d96619600f11ce0187658b8ae9e3480857dd790bc024f2"},
"sha256:afeb820a59e139d3e5d96619600f11ce0187658b8ae9e3480857dd790bc024f2"}, + {file = "opentelemetry_instrumentation_fastapi-0.48b0.tar.gz", hash = "sha256:21a72563ea412c0b535815aeed75fc580240f1f02ebc72381cfab672648637a2"}, ] [package.dependencies] opentelemetry-api = ">=1.12,<2.0" opentelemetry-instrumentation = "0.48b0" +opentelemetry-instrumentation-asgi = "0.48b0" opentelemetry-semantic-conventions = "0.48b0" opentelemetry-util-http = "0.48b0" +[package.extras] +instruments = ["fastapi (>=0.58,<1.0)"] + [[package]] name = "opentelemetry-proto" version = "1.27.0" @@ -7471,6 +7445,24 @@ pure-eval = "*" [package.extras] tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] +[[package]] +name = "starlette" +version = "0.40.0" +description = "The little ASGI library that shines." +optional = false +python-versions = ">=3.8" +files = [ + {file = "starlette-0.40.0-py3-none-any.whl", hash = "sha256:c494a22fae73805376ea6bf88439783ecfba9aac88a43911b48c653437e784c4"}, + {file = "starlette-0.40.0.tar.gz", hash = "sha256:1a3139688fb298ce5e2d661d37046a66ad996ce94be4d4983be019a23a04ea35"}, +] + +[package.dependencies] +anyio = ">=3.4.0,<5" +typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""} + +[package.extras] +full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"] + [[package]] name = "sympy" version = "1.13.1" @@ -8105,6 +8097,25 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "uvicorn" +version = "0.32.0" +description = "The lightning-fast ASGI server." +optional = false +python-versions = ">=3.8" +files = [ + {file = "uvicorn-0.32.0-py3-none-any.whl", hash = "sha256:60b8f3a5ac027dcd31448f411ced12b5ef452c646f76f02f8cc3f25d8d26fd82"}, + {file = "uvicorn-0.32.0.tar.gz", hash = "sha256:f78b36b143c16f54ccdb8190d0a26b5f1901fe5a3c777e1ab29f26391af8551e"}, +] + +[package.dependencies] +click = ">=7.0" +h11 = ">=0.8" +typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} + +[package.extras] +standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] + [[package]] name = "uvloop" version = "0.20.0" @@ -8283,7 +8294,7 @@ test = ["websockets"] name = "werkzeug" version = "3.0.4" description = "The comprehensive WSGI web application library." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "werkzeug-3.0.4-py3-none-any.whl", hash = "sha256:02c9eb92b7d6c06f31a782811505d2157837cea66aaede3e217c7c27c039476c"}, @@ -8663,4 +8674,4 @@ vectordb = ["faiss-cpu", "numpy"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a662a95423df55ea429547ebc2d91c2fbd38fd3da3b7ef05ac9c790bbdb7b2e0" +content-hash = "00dc0d05363d8bc86320d18440956938bb6554c7c861b7abf58c616789874fbc" diff --git a/pyproject.toml b/pyproject.toml index 625d4a962..8b6d8ac26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "guardrails-ai" -version = "0.5.15" +version = "0.6.0-alpha1" description = "Adding guardrails to large language models." 
authors = ["Guardrails AI "] license = "Apache License 2.0" @@ -28,7 +28,6 @@ rstr = "^3.2.2" typing-extensions = "^4.8.0" python-dateutil = "^2.8.2" tiktoken = ">=0.5.1" -nltk = ">3.0, <=3.8.1" litellm = "^1.37.14" sqlvalidator = {version = "^0.0.20", optional = true} sqlalchemy = {version = ">=2.0.9", optional = true} @@ -58,7 +57,7 @@ opentelemetry-exporter-otlp-proto-http = "^1.24.0" guardrails-hub-types = "^0.0.4" guardrails-api-client = "^0.3.13" diff-match-patch = "^20230430" -guardrails-api = "^0.0.3" +guardrails-api = ">=0.1.0a1,<0.2.0" mlflow = {version = "^2.0.1", optional = true} uvloop = {version = "^0.20.0", optional = true} semver = "^3.0.2" diff --git a/tests/integration_tests/test_assets/validators/detect_pii.py b/tests/integration_tests/test_assets/validators/detect_pii.py index ddcfb87cd..2da94e7d6 100644 --- a/tests/integration_tests/test_assets/validators/detect_pii.py +++ b/tests/integration_tests/test_assets/validators/detect_pii.py @@ -1,6 +1,5 @@ from typing import Any, Callable, Dict, List, Union import difflib -import nltk from guardrails.validator_base import ( FailResult, @@ -65,6 +64,14 @@ def chunking_function(self, chunk: str): Because using the tokenizer is expensive, we only use it if there is a period present in the chunk. """ + try: + import nltk + except ImportError: + raise ImportError( + "nltk is required for sentence splitting. Please install it using " + "`poetry add nltk`" + ) + # using the sentence tokenizer is expensive # we check for a . to avoid wastefully calling the tokenizer if "." not in chunk: diff --git a/tests/integration_tests/test_async_streaming.py b/tests/integration_tests/test_async_streaming.py index cb6fe18c0..e09ab47ff 100644 --- a/tests/integration_tests/test_async_streaming.py +++ b/tests/integration_tests/test_async_streaming.py @@ -176,8 +176,7 @@ async def test_async_streaming_fix_behavior_two_validators(mocker): assert ( text == """, under golden bridges, roams, - hills, his home. -dreams of fog, and salty air, + hills, his home.dreams of fog, and salty air, in his heart, he's always there.""" ) assert ( diff --git a/tests/integration_tests/test_streaming.py b/tests/integration_tests/test_streaming.py index 9ae025146..cc3d92181 100644 --- a/tests/integration_tests/test_streaming.py +++ b/tests/integration_tests/test_streaming.py @@ -487,8 +487,7 @@ def test_fix_behavior_one_validator(mocker): assert ( text == """"john, under golden bridges, roams, -san francisco's hills, his home. -dreams of fog, and salty air, +san francisco's hills, his home.dreams of fog, and salty air, in his heart, he's always there.""" ) assert ( @@ -530,8 +529,7 @@ def test_fix_behavior_two_validators(mocker): assert ( text == """", under golden bridges, roams, - hills, his home. -dreams of fog, and salty air, + hills, his home.dreams of fog, and salty air, in his heart, he's always there.""" ) assert ( @@ -584,8 +582,7 @@ def test_fix_behavior_three_validators(mocker): assert ( text == """"REDACTED!!, under purple!! bridges, roams, - hills, his home. -dreams of fog, and salty air, + hills, his home.dreams of fog, and salty air, in his heart, he's always there.""" ) assert ( diff --git a/tests/unit_tests/classes/test_rc.py b/tests/unit_tests/classes/test_rc.py index fb71471f0..3ade2e8a7 100644 --- a/tests/unit_tests/classes/test_rc.py +++ b/tests/unit_tests/classes/test_rc.py @@ -4,11 +4,6 @@ class TestRC: def test_load(self, mocker): - # TODO: Re-enable this once we move nltk.download calls to individual validator repos. 
-        # Right now, it fires during our import chain, causing this to blow up
-        mocker.patch("nltk.data.find")
-        mocker.patch("nltk.download")
-
         expanduser_mock = mocker.patch("guardrails.classes.rc.expanduser")
         expanduser_mock.return_value = "/Home"
diff --git a/tests/unit_tests/cli/test_configure.py b/tests/unit_tests/cli/test_configure.py
index e241fa0f7..037efbbcb 100644
--- a/tests/unit_tests/cli/test_configure.py
+++ b/tests/unit_tests/cli/test_configure.py
@@ -61,11 +61,6 @@ def test_configure(mocker, runner, expected_token, enable_metrics, clear_token):
 
 
 def test_save_configuration_file(mocker):
-    # TODO: Re-enable this once we move nltk.download calls to individual validator repos.  # noqa
-    # Right now, it fires during our import chain, causing this to blow up
-    mocker.patch("nltk.data.find")
-    mocker.patch("nltk.download")
-
     expanduser_mock = mocker.patch("guardrails.cli.configure.expanduser")
     expanduser_mock.return_value = "/Home"
diff --git a/tests/unit_tests/cli/test_validate.py b/tests/unit_tests/cli/test_validate.py
index 1368c2ef7..2235009e7 100644
--- a/tests/unit_tests/cli/test_validate.py
+++ b/tests/unit_tests/cli/test_validate.py
@@ -2,9 +2,6 @@
 
 
 def test_validate(mocker):
-    mocker.patch("nltk.data.find")
-    mocker.patch("nltk.download")
-
     mock_validate_llm_output = mocker.patch(
         "guardrails.cli.validate.validate_llm_output"
     )
diff --git a/tests/unit_tests/utils/test_docs_utils.py b/tests/unit_tests/utils/test_docs_utils.py
index 6f03857be..9cbf10e00 100644
--- a/tests/unit_tests/utils/test_docs_utils.py
+++ b/tests/unit_tests/utils/test_docs_utils.py
@@ -48,19 +48,6 @@ def test_text_splitter_split(mock_tokenizer):
     assert chunks[3] == "10 11 12 13 14 15 16 17"
 
 
-# @patch('nltk.data.find', side_effect=LookupError)
-# def test_sentence_split_nltk_download_error(mock_nltk_find):
-#     with pytest.raises(ImportError):
-#         sentence_split("This is a test sentence.")
-
-# @patch('nltk.data.find')
-# def test_sentence_split(mock_nltk_find):
-#     mock_nltk_find.return_value = True
-#     result = sentence_split("This is a test sentence.")
-#     assert len(result) == 1
-#     assert result[0] == "This is a test sentence."
-
-
 def test_prompt_template_token_length(mock_tokenizer, mock_prompt_template):
     text_splitter = TextSplitter()
     length = text_splitter.prompt_template_token_length(mock_prompt_template)
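Reviewer note (illustrative only, not part of the patch): the updated expectations in the streaming tests appear to follow from the new splitter rather than from any validator change. The split pattern `\n?{separator} ?\n?` also consumes the newline after a sentence-ending period, so when the per-sentence chunks are re-joined the line break between "his home." and "dreams of fog" disappears. A minimal check, assuming the module added above:

    from guardrails.utils.tokenization_utils import split_sentences

    text = "san francisco's hills, his home.\ndreams of fog, and salty air,"
    print(split_sentences(text))
    # Expected (roughly):
    #   ["san francisco's hills, his home.", "dreams of fog, and salty air,"]
    # The "\n" is eaten by the split, so concatenating the pieces yields
    # "...his home.dreams of fog..." as asserted in the tests above.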