
Streaming chunk accumulation #741


Merged

Changes shown from 21 of 45 commits.

Commits
29ee128
preliminary code and pseudocode
nichwch May 7, 2024
66a5333
add chunk accumulation strategy to Validator base class
nichwch May 7, 2024
7831466
handle case where llm chunk > validator chunk in validator class
nichwch May 8, 2024
2dbae2e
added some questions
nichwch May 8, 2024
1e3544d
change stream_runner to handle the result of iterable validate
nichwch May 8, 2024
e9084e1
format
nichwch May 8, 2024
1269f68
change validator base to use a chunking function instead of specifyin…
nichwch May 10, 2024
b454cd5
connect streaming all the way down call chain, include validated chun…
nichwch May 10, 2024
b64ab4e
change execute_validator to handle streaming
nichwch May 10, 2024
bf2bd32
make validate take stream parameter, remove validate_stream in top le…
nichwch May 10, 2024
c79e9b2
use wyatts sentence splitting strategy
nichwch May 10, 2024
4583cb9
import nltk
nichwch May 10, 2024
f1b4a88
use stream-enabled execute_validator
nichwch May 14, 2024
289745c
format
nichwch May 14, 2024
58d8eed
fix bug where json_schema was being called with streaming
nichwch May 15, 2024
947f476
conditionally use old logic for json_schema to avoid breaking json_sc…
nichwch May 16, 2024
8b2c154
validate remainders
nichwch May 16, 2024
d6c3739
merge in main
nichwch May 16, 2024
0ab245c
new chunk span validation schema
nichwch May 16, 2024
a320464
field for reason that validation failed for a given span
nichwch May 16, 2024
93bb781
add validated_chunk to ValidationResult
nichwch May 17, 2024
1381821
add helper method to get a list of error spans relative to llm output
nichwch May 17, 2024
3ccdda1
conceptual question
nichwch May 17, 2024
6fdbcd1
Merge branch 'main' into nichwch/chunk-accumulation-rewrite
nichwch May 17, 2024
f455ae2
Merge branch 'nichwch/chunk-accumulation-rewrite' into nichwch/stream…
nichwch May 17, 2024
74485eb
turn chunking_function into class method
nichwch May 17, 2024
a39b5af
incomplete tests for streaming chunk accumulation
nichwch May 17, 2024
0ae850e
format
nichwch May 20, 2024
847dd0a
remove print
nichwch May 20, 2024
f0b3030
fix a few bugs uncovered by testing
nichwch May 20, 2024
e8b6069
tests (WIP) for streaming
nichwch May 20, 2024
628e490
Merge branch 'main' into nichwch/chunk-accumulation-rewrite
nichwch May 21, 2024
a9a91a1
merge
nichwch May 21, 2024
eec8e19
base model
nichwch May 21, 2024
8726a28
optional typing to avoid breaking existing validators
nichwch May 21, 2024
ba68eb6
top level helper function for spans on guard, patch validated_chunk i…
nichwch May 21, 2024
2607423
attempt to use openai finish_reason field
nichwch May 21, 2024
da720c3
add comment explaining problem with using openai finish_message
nichwch May 21, 2024
8bdb292
test error span behavior
nichwch May 22, 2024
0abac83
address some changes
nichwch May 28, 2024
8f45a0a
handle case where llm callable doesnt provide finished flag
nichwch May 28, 2024
dfcd3b8
Merge pull request #771 from guardrails-ai/nichwch/streaming-error-spans
nichwch May 28, 2024
fe56871
Merge branch 'feat/streaming-update' into nichwch/chunk-accumulation-…
CalebCourier May 30, 2024
b52b8cb
lint, type, and test fixes
CalebCourier May 30, 2024
0aede77
use status for validation_passed in streaming
CalebCourier May 30, 2024
28 changes: 20 additions & 8 deletions guardrails/run/runner.py
@@ -547,17 +547,29 @@ def validate(
index: int,
parsed_output: Any,
output_schema: Schema,
stream: Optional[bool] = False,
**kwargs,
):
"""Validate the output."""
validated_output = output_schema.validate(
iteration,
parsed_output,
self.metadata,
attempt_number=index,
disable_tracer=self._disable_tracer,
**kwargs,
)
if isinstance(output_schema, StringSchema):
validated_output = output_schema.validate(
iteration,
parsed_output,
self.metadata,
index,
self._disable_tracer,
stream,
**kwargs,
)
else:
validated_output = output_schema.validate(
iteration,
parsed_output,
self.metadata,
attempt_number=index,
disable_tracer=self._disable_tracer,
**kwargs,
)

return validated_output

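For orientation, a minimal sketch of a call into this method with the new stream flag; runner, iteration, chunk_text, and schema are illustrative stand-ins assumed for the example, not a prescribed setup:

# Hedged sketch: the keyword names mirror the signature above, but the
# surrounding objects are assumptions made for illustration only.
validated = runner.validate(
    iteration=iteration,
    index=0,
    parsed_output=chunk_text,
    output_schema=schema,   # a StringSchema instance
    stream=True,            # forwarded to StringSchema.validate; other schemas keep the old path
)
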
121 changes: 101 additions & 20 deletions guardrails/run/stream_runner.py
@@ -153,47 +153,128 @@ def step(
verified = set()
# Loop over the stream
# and construct "fragments" of concatenated chunks
for chunk in stream:
# 1. Get the text from the chunk and append to fragment
chunk_text = self.get_chunk_text(chunk, api)
fragment += chunk_text
# for now, handle string and json schema differently

# 2. Parse the fragment
parsed_fragment, move_to_next = self.parse(
index, fragment, output_schema, verified
)
if move_to_next:
# Continue to next chunk
continue
if isinstance(output_schema, StringSchema):
for chunk in stream:
# 1. Get the text from the chunk and append to fragment
chunk_text = self.get_chunk_text(chunk, api)
fragment += chunk_text

# 2. Parse the chunk
parsed_chunk, move_to_next = self.parse(
index, chunk_text, output_schema, verified
)
if move_to_next:
# Continue to next chunk
continue
validated_result = self.validate(
iteration,
index,
parsed_chunk,
output_schema,
True,
validate_subschema=True,
)
if isinstance(validated_result, SkeletonReAsk):
raise ValueError(
"Received fragment schema is an invalid sub-schema "
"of the expected output JSON schema."
)

# 3. Run output validation
validated_fragment = self.validate(
# 4. Introspect: inspect the validated fragment for reasks
reasks, valid_op = self.introspect(
index, validated_result, output_schema
)
if reasks:
raise ValueError(
"Reasks are not yet supported with streaming. Please "
"remove reasks from schema or disable streaming."
)
# 5. Convert validated fragment to a pretty JSON string
yield ValidationOutcome(
# The chunk or the whole output?
raw_llm_output=chunk,
validated_output=validated_result,
validation_passed=validated_fragment is not None,
)
######################################
# need to validate remainder of chunks
######################################
remainder_validation = self.validate(
iteration,
index,
parsed_fragment,
"",
output_schema,
True,
validate_subschema=True,
remainder=True,
)
if isinstance(validated_fragment, SkeletonReAsk):
if isinstance(remainder_validation, SkeletonReAsk):
raise ValueError(
"Received fragment schema is an invalid sub-schema "
"of the expected output JSON schema."
)

# 4. Introspect: inspect the validated fragment for reasks
reasks, valid_op = self.introspect(index, validated_fragment, output_schema)
reasks, valid_op = self.introspect(
index, remainder_validation, output_schema
)
if reasks:
raise ValueError(
"Reasks are not yet supported with streaming. Please "
"remove reasks from schema or disable streaming."
)

# 5. Convert validated fragment to a pretty JSON string
yield ValidationOutcome(
raw_llm_output=fragment,
validated_output=validated_fragment,
validation_passed=validated_fragment is not None,
# The chunk or the whole output?
raw_llm_output=chunk,
validated_output=remainder_validation,
validation_passed=remainder_validation is not None,
)
# handle non string schema
else:
for chunk in stream:
# 1. Get the text from the chunk and append to fragment
chunk_text = self.get_chunk_text(chunk, api)
fragment += chunk_text

parsed_fragment, move_to_next = self.parse(
index, fragment, output_schema, verified
)
if move_to_next:
# Continue to next chunk
continue
validated_fragment = self.validate(
iteration,
index,
parsed_fragment,
output_schema,
validate_subschema=True,
)
if isinstance(validated_fragment, SkeletonReAsk):
raise ValueError(
"Received fragment schema is an invalid sub-schema "
"of the expected output JSON schema."
)

# 4. Introspect: inspect the validated fragment for reasks
reasks, valid_op = self.introspect(
index, validated_fragment, output_schema
)
if reasks:
raise ValueError(
"Reasks are not yet supported with streaming. Please "
"remove reasks from schema or disable streaming."
)

# 5. Convert validated fragment to a pretty JSON string
yield ValidationOutcome(
raw_llm_output=fragment,
validated_output=validated_fragment,
validation_passed=validated_fragment is not None,
)

# Finally, add to logs
iteration.outputs.raw_output = fragment
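
To make the control flow above easier to follow, here is a simplified, self-contained sketch of the string-schema branch: one validation call per incoming chunk, followed by a final remainder pass over whatever is still buffered. The names step_string_schema and upper_sentences are invented for illustration and are not part of guardrails.

from typing import Callable, Iterable, Optional


def step_string_schema(
    stream_chunks: Iterable[str],
    validate: Callable[..., Optional[str]],
):
    """Mirror of the string-schema loop: per-chunk validation, then a remainder pass."""
    fragment = ""
    for chunk_text in stream_chunks:
        fragment += chunk_text
        yield chunk_text, validate(chunk_text)   # validators accumulate internally
    yield "", validate("", remainder=True)       # validate whatever is still buffered


# Toy validator: upper-case each complete sentence, buffer everything else.
buffered: list = []


def upper_sentences(chunk: str, remainder: bool = False) -> Optional[str]:
    buffered.append(chunk)
    text = "".join(buffered)
    if remainder or text.endswith(". "):
        buffered.clear()
        return text.upper()
    return None                                  # still accumulating


for raw, validated in step_string_schema(["Hi ", "there. ", "Bye"], upper_sentences):
    print(repr(raw), "->", repr(validated))
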
6 changes: 4 additions & 2 deletions guardrails/schema/string_schema.py
@@ -134,6 +134,7 @@ def validate(
metadata: Dict,
attempt_number: int = 0,
disable_tracer: Optional[bool] = True,
stream: Optional[bool] = False,
**kwargs,
) -> Any:
"""Validate a dictionary of data against the schema.
@@ -160,19 +161,20 @@
dummy_key: data,
},
)

validated_response, metadata = validator_service.validate(
value=data,
metadata=metadata,
validator_setup=validation,
iteration=iteration,
disable_tracer=disable_tracer,
stream=stream,
**kwargs,
)

validated_response = {dummy_key: validated_response}

if check_refrain_in_dict(validated_response):
# If the data contains a `Refain` value, we return an empty
# If the data contains a `Refrain` value, we return an empty
# dictionary.
logger.debug("Refrain detected.")
validated_response = {}
64 changes: 64 additions & 0 deletions guardrails/validator_base.py
@@ -1,4 +1,5 @@
import inspect
import nltk
from collections import defaultdict
from copy import deepcopy
from enum import Enum
@@ -175,6 +176,28 @@ class Refrain:
pass


# functions to get chunks
def split_word(chunk: str):
return list(map(lambda x: x + " ", chunk.split(" ")))[:-1]


def split_sentence(chunk: str):
# using the sentence tokenizer is expensive
# we check for a . to avoid wastefully calling the tokenizer
if "." not in chunk:
return []
sentences = nltk.sent_tokenize(chunk)
if len(sentences) == 0:
return []
# return the sentence
# then the remaining chunks that aren't finished accumulating
return [sentences[0], "".join(sentences[1:])]


def split_paragraph(chunk: str):
return list(map(lambda x: x + "\n", chunk.split("\n")))[:-1]


def check_refrain_in_list(schema: List) -> bool:
"""Checks if a Refrain object exists in a list.

@@ -390,6 +413,10 @@ class Validator(Runnable):

rail_alias: str = ""

# chunking function returns empty list or list of 2 chunks
# first chunk is the chunk to validate
# second chunk is incomplete chunk that needs further accumulation
accumulated_chunks = []
run_in_separate_process = False
override_value_on_pass = False
required_metadata_keys = []
@@ -448,10 +475,47 @@ def __init__(
self.rail_alias in validators_registry
), f"Validator {self.__class__.__name__} is not registered. "

def chunking_function(self, chunk: str):
return split_sentence(chunk)

def validate(self, value: Any, metadata: Dict[str, Any]) -> ValidationResult:
"""Validates a value and return a validation result."""
raise NotImplementedError

def validate_stream(
self, chunk: Any, metadata: Dict[str, Any], **kwargs
) -> ValidationResult:
"""Validates a chunk emitted by an LLM.
If the LLM chunk is smaller than the validator's chunking strategy,
it will be accumulated until it reaches the desired size. In the meantime,
the validator will return None.

If the LLM chunk is larger than the validator's chunking strategy,
it will split it into validator-sized chunks and validate each one,
returning an array of validation results.

Otherwise, the validator will validate the chunk and return the result.
"""
# combine accumulated chunks and new chunk
self.accumulated_chunks.append(chunk)
accumulated_text = "".join(self.accumulated_chunks)
# check if enough chunks have accumulated for validation
splitcontents = self.chunking_function(accumulated_text)

# if remainder kwargs is passed, validate remainder regardless
remainder = kwargs.get("remainder", False)
if remainder:
splitcontents = [accumulated_text, []]
if len(splitcontents) == 0:
return None
[chunk_to_validate, new_accumulated_chunks] = splitcontents
self.accumulated_chunks = new_accumulated_chunks
# exclude last chunk, because it may not be a complete chunk
validation_result = self.validate(chunk_to_validate, metadata)
# include the chunk that we've validated in the metadata
validation_result.metadata["validated_chunk"] = chunk_to_validate
return validation_result

def to_prompt(self, with_keywords: bool = True) -> str:
"""Convert the validator to a prompt.

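For a feel of what the chunking helpers defined above return, a short illustration follows. It assumes the functions stay importable from guardrails.validator_base and that the nltk punkt tokenizer data is installed; the exact sentence boundaries come from nltk.sent_tokenize, so treat the second result as approximate.

from guardrails.validator_base import split_paragraph, split_sentence, split_word

print(split_word("foo bar baz"))        # ['foo ', 'bar ']    - trailing partial word is not returned
print(split_sentence("Hi there. How"))  # ['Hi there.', 'How'] - first complete sentence, then the remainder
print(split_sentence("no period yet"))  # []                   - nothing complete yet, keep accumulating
print(split_paragraph("a\nb\nc"))       # ['a\n', 'b\n']       - trailing partial paragraph is held back
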
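Finally, a compact, self-contained sketch of the accumulation behaviour that validate_stream implements: buffer incoming LLM chunks until the chunking function yields a complete unit, validate that unit, and carry the leftover into the next round. Word-level splitting is used here so the example runs without nltk; split_words and validate_stream_sketch are illustrative stand-ins, not library code.

from typing import Iterable, Iterator, List, Tuple


def split_words(text: str) -> Tuple[List[str], str]:
    """Return (complete words, trailing partial word that keeps accumulating)."""
    if " " not in text:
        return [], text
    *complete, partial = text.split(" ")
    return [word + " " for word in complete], partial


def validate_stream_sketch(llm_chunks: Iterable[str]) -> Iterator[str]:
    buffer = ""
    for chunk in llm_chunks:
        buffer += chunk                      # accumulate until a full unit exists
        complete, buffer = split_words(buffer)
        for unit in complete:
            yield unit                       # each complete unit is what validate() would receive
    if buffer:
        yield buffer                         # final remainder pass (remainder=True)


print(list(validate_stream_sketch(["he", "llo wor", "ld again"])))
# ['hello ', 'world ', 'again']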