guardrails-ai · CalebCourier · Jun 3, 2024 · May 7, 2024 · May 7, 2024 · May 8, 2024
diff --git a/guardrails/run/stream_runner.py b/guardrails/run/stream_runner.py
@@ -158,19 +158,27 @@ def step(
             chunk_text = self.get_chunk_text(chunk, api)
             fragment += chunk_text
 
-            # 2. Parse the fragment
-            parsed_fragment, move_to_next = self.parse(
-                index, fragment, output_schema, verified
+            # 2. Parse the chunk
+            # I assume we have to parse the chunk before validating it...
+            parsed_chunk, move_to_next = self.parse(
+                index, chunk, output_schema, verified
             )
             if move_to_next:
                 # Continue to next chunk
                 continue
 
             # 3. Run output validation
+            # If the validator's chunk size is smaller than the LLM's chunk size,
+            # split the LLM chunk into validator-sized chunks.
+            # TODO: work out how to determine the validator's chunk size.
+
+            # If the validator's chunk size is larger, pass the chunk to the validator;
+            # it will return None until it has accumulated enough text.
+            # TODO: validate any incomplete trailing chunk at the end of the stream.
             validated_fragment = self.validate(
                 iteration,
                 index,
-                parsed_fragment,
+                parsed_chunk,
                 output_schema,
                 validate_subschema=True,
             )

diff --git a/guardrails/schema/string_schema.py b/guardrails/schema/string_schema.py
@@ -136,6 +136,9 @@ def validate(
         disable_tracer: Optional[bool] = True,
         **kwargs,
     ) -> Any:
+        # TODO: add class field to track number of chunks accumulated
+        # If not enough chunks have been accumulated, emit None
+        # Once enough chunks have been accumulated, validate and emit the result
         """Validate a dictionary of data against the schema.
 
         Args:

diff --git a/guardrails/validator_base.py b/guardrails/validator_base.py
@@ -7,6 +7,7 @@
     Any,
     Callable,
     Dict,
+    Iterable,
     List,
     Literal,
     Optional,
@@ -26,6 +27,8 @@
 from guardrails.errors import ValidationError
 from guardrails.utils.dataclass import dataclass
 
+VALIDATOR_CHUNKING_STRATEGIES = Enum('VALIDATOR_CHUNKING_STRATEGIES', ['WORD', 'SENTENCE', 'PARAGRAPH'])
+
 VALIDATOR_IMPORT_WARNING = """Accessing `{validator_name}` using
 `from guardrails.validators import {validator_name}` is deprecated and
 support will be removed after version 0.5.x. Please switch to the Guardrails Hub syntax:
@@ -174,6 +177,14 @@ class Filter:
 class Refrain:
     pass
 
+def is_word(chunk:str) -> bool:  # True when chunk contains a space character.
+    return ' ' in chunk
+
+def is_sentence(chunk:str) -> bool:  # True when chunk contains a period.
+    return '.' in chunk
+
+def is_paragraph(chunk:str) -> bool:  # True when chunk contains a newline.
+    return '\n' in chunk
 
 def check_refrain_in_list(schema: List) -> bool:
     """Checks if a Refrain object exists in a list.
@@ -390,6 +401,8 @@ class Validator(Runnable):
 
     rail_alias: str = ""
 
+    chunking_strategy=VALIDATOR_CHUNKING_STRATEGIES.SENTENCE
+    accumulated_chunks = []
     run_in_separate_process = False
     override_value_on_pass = False
     required_metadata_keys = []
@@ -452,6 +465,38 @@ def validate(self, value: Any, metadata: Dict[str, Any]) -> ValidationResult:
         """Validates a value and return a validation result."""
         raise NotImplementedError
 
+    def validate_stream(self, chunk:Any, metadata: Dict[str, Any]) -> Optional[ValidationResult]:
+        """Validate a chunk emitted by an LLM, buffering it until enough has arrived.
+
+        Chunks accumulate until ``accumulated_enough_to_validate`` returns a truthy
+        value for the validator's chunking strategy. The buffered text is then
+        joined, the buffer is cleared, and ``validate`` is called with the joined
+        text and the unchanged ``metadata``.
+
+        Returns None while still accumulating, otherwise the result of ``validate``.
+        """
+        # Rebind instead of appending in place: ``accumulated_chunks`` defaults to a
+        # list declared on the class, and mutating it would leak chunks between
+        # validators that have not yet been given a buffer of their own.
+        self.accumulated_chunks = [*self.accumulated_chunks, chunk]
+        if not self.accumulated_enough_to_validate():
+            return None
+        accumulated_text = ''.join(self.accumulated_chunks)
+        self.accumulated_chunks = []
+        return self.validate(accumulated_text, metadata)
+
+
+    def accumulated_enough_to_validate(self) -> bool:
+        """Return whether the joined buffer passes the chunking strategy's check."""
+        accumulated_text = ''.join(self.accumulated_chunks)
+        if self.chunking_strategy == VALIDATOR_CHUNKING_STRATEGIES.WORD:
+            return is_word(accumulated_text)
+        if self.chunking_strategy == VALIDATOR_CHUNKING_STRATEGIES.SENTENCE:
+            return is_sentence(accumulated_text)
+        if self.chunking_strategy == VALIDATOR_CHUNKING_STRATEGIES.PARAGRAPH:
+            return is_paragraph(accumulated_text)
+        return False  # Unrecognised strategy: report "not enough" rather than None.
+
     def to_prompt(self, with_keywords: bool = True) -> str:
         """Convert the validator to a prompt.