
Commit 90061b5

Merge pull request #5 from guardrails-ai/jc/change_behavior
Fix 'OnFix' behavior.
2 parents 7b504ed + 680a8f8 commit 90061b5
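
In short, with on_fail="fix" a failing passage is now split into sentences and only the biased sentences are dropped, rather than discarding the whole passage from the fix value. A minimal sketch of that path, mirroring the new test_sentence_fix below (which sentences survive depends on the d4data/bias-detection-model scores):

from guardrails.validator_base import FailResult
from validator import BiasCheck

validator = BiasCheck(on_fail="fix", threshold=0.9)
result = validator.validate(
    "Men these days don't care about my arbitrary and deletarious standards of gender. "
    "They only care about emotional honesty and participating in a productive, healthy society. smh"
)
if isinstance(result, FailResult):
    print(result.fix_value) # Only the sentences scoring below the threshold survive.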

File tree: 6 files changed, +127 -44 lines changed


.github/workflows/publish_pypi.yml

Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
+name: Publish to Guardrails Hub
+
+on:
+  workflow_dispatch:
+  push:
+    # Publish when new releases are tagged.
+    tags:
+      - '*'
+
+jobs:
+  setup:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Build & Deploy
+        uses: guardrails-ai/guardrails/.github/actions/validator_pypi_publish@main
+        with:
+          guardrails_token: ${{ secrets.GR_GUARDRAILS_TOKEN }}
+          validator_id: guardrails/bias_check

README.md

Lines changed: 7 additions & 3 deletions

@@ -17,7 +17,11 @@ This validator can be used to ensure fairness of model output across various dem
 
 * Dependencies:
     - guardrails-ai>=0.5.0
-    - transformers>=4.40
+    - transformers>=4.37
+    - tf-keras
+    - sentencepiece
+    - tensorflow>=2.16
+    - sentence-splitter
 
 * Dev Dependencies:
     - pytest
@@ -50,5 +54,5 @@ guard = Guard().use(
     BiasCheck(threshold=0.9, on_fail="exception")
 )
 
-guard.validate("The movie was great!") # Validator passes
-guard.validate("Why do men always think the movie was great?") # Validator fails
+guard.validate("The movie was great.") # Validator passes
+guard.validate("Why do men think that movie was great?") # Validator fails

pyproject.toml

Lines changed: 8 additions & 5 deletions

@@ -1,17 +1,20 @@
 [project]
 name = "bias_check"
-version = "0.2.0"
+version = "0.3.0"
 description = "Wrapping a bias check model into a validator."
 authors = [
     {name = "Guardrails AI", email = "contact@guardrailsai.com"}
 ]
 license = {file = "LICENSE"}
 readme = "README.md"
-requires-python = ">= 3.8.1"
+requires-python = ">= 3.9"
 dependencies = [
-    "guardrails-ai>=0.4.0",
-    "transformers>=4.40.2",
-    "tensorflow>=2.18.0"
+    "guardrails-ai>=0.5.15",
+    "transformers>=4.37.0",
+    "tf-keras",
+    "sentencepiece",
+    "tensorflow>=2.16.0", # Required for the dbias model, but not as a direct dependency.
+    "sentence-splitter>=1.4"
 ]
 
 [project.optional-dependencies]

tests/test_validator.py

Lines changed: 20 additions & 14 deletions

@@ -3,20 +3,26 @@
 
 from validator import BiasCheck
 
-# We use 'exception' as the validator's fail action,
-# so we expect failures to always raise an Exception
-# Learn more about corrective actions here:
-# https://www.guardrailsai.com/docs/concepts/output/#%EF%B8%8F-specifying-corrective-actions
+
 def test_success_case():
-  validator = BiasCheck()
-  input_text = "The sun rises in the morning."
-  result = validator.validate(input_text, {})
-  assert isinstance(result, PassResult)
+    validator = BiasCheck()
+    input_text = "The sun rises in the morning."
+    result = validator.validate(input_text, {})
+    assert isinstance(result, PassResult)
+
 
 def test_failure_case():
-  validator = BiasCheck(threshold=0.5)
-  input_text = "The sun only rises for Humanists."
-  result = validator.validate(input_text, {})
-  assert isinstance(result, FailResult)
-  assert result.error_message == "The original response contains potentially biased messages:\nThe sun only rises for Humanists.\n (Message scores: 0.6546186208724976)"
-  assert result.fix_value == ""
+    validator = BiasCheck(threshold=0.0)
+    input_text = "Men these days don't care about my arbitrary and deletarious standards of gender. They only care about emotional honesty and participating in a productive, healthy society. smh"
+    result = validator.validate(input_text, {})
+    assert isinstance(result, FailResult)
+    assert result.error_message == "The original response contains potentially biased messages:\nMen these days don't care about my arbitrary and deletarious standards of gender. They only care about emotional honesty and participating in a productive, healthy society. smh\n (Message scores: 0.9477301836013794)"
+    assert result.fix_value == ""
+
+
+def test_sentence_fix():
+    v = BiasCheck(on_fail='fix', threshold=0.9)
+    input_text = "Men these days don't care about my arbitrary and deletarious standards of gender. They only care about emotional honesty and participating in a productive, healthy society. smh"
+    out = v.validate(input_text)
+    assert isinstance(out, FailResult)
+    assert out.fix_value == "Men these days don't care about my arbitrary and deletarious standards of gender."

validator/main.py

Lines changed: 69 additions & 21 deletions

@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, Optional
+from typing import Callable, Dict, List, Optional, Union
 
 from guardrails.validator_base import (
     FailResult,
@@ -7,9 +7,11 @@
     Validator,
     register_validator,
 )
-
+from guardrails.types import OnFailAction
+from sentence_splitter import split_text_into_sentences
 from transformers import pipeline
 
+
 @register_validator(name="guardrails/bias_check", data_type="string")
 class BiasCheck(Validator):
     """Validates that the text is free from biases related to age, gender, sex, ethnicity, religion, etc.
@@ -23,61 +25,107 @@ class BiasCheck(Validator):
     | Programmatic fix | The debiased text if bias is detected |
 
     Args:
-    threshold (float): Higher is more likely to allow bias. Lower is more sensitive and more likely to flag biased messages.
-    on_fail (Callable): The policy to enact when a validator fails. If `str`, must be one of `filter`, `noop`, or `exception`. Otherwise, must be a function that is called when the validator fails.
+        threshold (float): Higher is more likely to allow bias. Lower is more sensitive and more likely to flag biased messages.
+        on_fail (Callable): The policy to enact when a validator fails. If `str`, must be one of `noop`, `fix`, or `exception`. Otherwise, must be a function that is called when the validator fails.
     """ # noqa
 
     def __init__(
         self,
         threshold: float = 0.9,
-        on_fail: Optional[Callable] = None,
+        on_fail: Optional[Union[str, Callable]] = None,
     ):
-        super().__init__(on_fail=on_fail)
-        valid_on_fail_operations = {"filter", "noop", "exception"}
+        super().__init__(on_fail=on_fail) # type: ignore
+        valid_on_fail_operations = {"fix", "noop", "exception"}
         if isinstance(on_fail, str) and on_fail not in valid_on_fail_operations:
             raise Exception(
                 f"on_fail value ({on_fail}) not in list of allowable operations: {valid_on_fail_operations}"
            )
         self.threshold = threshold
-        self.model = pipeline(
+
+        # There are some spurious loading complaints with TFDistilBert models.
+        # See https://discuss.huggingface.co/t/message-some-layers-from-the-model-were-not-used/1972/7
+        self.classification_model = pipeline(
             'text-classification',
             model="d4data/bias-detection-model",
+            tokenizer="d4data/bias-detection-model",
         )
-        # There are some spurious loading complaints with TFDistilBert models.
-        # See https://discuss.huggingface.co/t/message-some-layers-from-the-model-were-not-used/1972/7
 
-    def validate(self, value: Any, metadata: Optional[Dict] = None) -> ValidationResult:
+    def validate(
+        self,
+        value: Union[str, List[str]],
+        metadata: Optional[Dict] = None
+    ) -> ValidationResult:
        """Validates that the text is free from biases related to age, gender, sex, ethnicity, religion, etc."""
+        single_sentence_passed = False
        if isinstance(value, str):
+            single_sentence_passed = True
            value = [value,] # Ensure we're always passing lists of strings into the classifier.
 
-        classified_examples = self.model(value)
+        scores = self._inference(value)
        passing_outputs = list()
        passing_scores = list()
        failing_outputs = list()
        failing_scores = list()
-        for text, prediction in zip(value, classified_examples):
-            if prediction['label'] == 'Biased':
-                score = prediction['score']
-            elif prediction['label'] == 'Non-biased':
-                score = -prediction['score'] # Note the negation!
-            else:
-                raise Exception(f"Got unexpected prediction label: {prediction['label']}")
+        all_outputs = list() # A tuple of (fix/ignore, sentence)
+        for text, score in zip(value, scores):
            if score > self.threshold:
                failing_outputs.append(text)
                failing_scores.append(score)
            else:
                passing_outputs.append(text)
                passing_scores.append(score)
+            all_outputs.append((score > self.threshold, text))
 
        if failing_outputs:
            failure_message = "The original response contains potentially biased messages:\n"
            failure_message += "\n - ".join(failing_outputs)
            message_scores = [str(s) for s in failing_scores]
            failure_message += "\n (Message scores: {})".format(", ".join(message_scores))
-            # Do we need to call the on_fail_method here?
+            # Three paths: noop, exception, fix.
+            # on_fail == NOOP, return only passing passages.
+            # on_fail == FIX, split passages into sentences and drop sentences.
+            # EXCEPTION is handled farther up the stack.
+            if self.on_fail_descriptor != OnFailAction.FIX:
+                fix_value = passing_outputs
+            else:
+                fix_value = list()
+                for needs_fix, text in all_outputs:
+                    if not needs_fix:
+                        fix_value.append(text)
+                    else:
+                        # The 'text' is a full document, passage, or paragraph.
+                        fix_value.append(self.fix_passage(text))
            return FailResult(
                error_message=failure_message,
-                fix_value=" ".join(passing_outputs),
+                fix_value=" ".join(fix_value) if single_sentence_passed else fix_value,
            )
        return PassResult()
+
+    def fix_passage(self, text: str) -> str:
+        """Given a passage of text, split it into sentences, evaluate each for bias,
+        then recombine them and return a new paragraph. May not preserve whitespace
+        between sentences."""
+        sentences = split_text_into_sentences(text, language='en')
+        scores = self._inference(sentences)
+        unbiased_sentences = list()
+        for score, sentence in zip(scores, sentences):
+            if score < self.threshold:
+                unbiased_sentences.append(sentence)
+        return " ".join(unbiased_sentences)
+
+    # This normally will be called by _inference.
+    # Remote inference is unsupported for this model on account of the NER.
+    def _inference_local(self, sentences: List[str]) -> List[float]: # type: ignore
+        scores = list()
+        predictions = self.classification_model(sentences)
+        for pred in predictions:
+            label = pred['label'] # type: ignore
+            score = pred['score'] # type: ignore
+            if label == 'Biased':
+                scores.append(score)
+            elif label == 'Non-biased':
+                scores.append(-score)
+            else:
+                # This should never happen:
+                raise Exception("Unexpected prediction label: {}".format(label))
+        return scores
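
To make the new fix path concrete, here is a standalone sketch of what fix_passage does, with the classifier stubbed out. fake_scores stands in for self._inference, which returns one signed score per sentence (positive for 'Biased', negated for 'Non-biased'); only split_text_into_sentences is the real dependency here.

from sentence_splitter import split_text_into_sentences

text = "This sentence is fine. This one, we pretend, was scored as biased."
sentences = split_text_into_sentences(text, language='en')
fake_scores = [0.12, 0.95] # stand-in for self._inference(sentences)
threshold = 0.9
# Keep only the sentences scoring below the bias threshold, as fix_passage does.
kept = [s for score, s in zip(fake_scores, sentences) if score < threshold]
print(" ".join(kept)) # -> "This sentence is fine."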

validator/post-install.py

Lines changed: 5 additions & 1 deletion

@@ -1,4 +1,8 @@
 from transformers import pipeline
 print("post-install starting...")
-_ = pipeline("text-classification", "d4data/bias-detection-model")
+_ = pipeline(
+    'text-classification',
+    model="d4data/bias-detection-model",
+    tokenizer="d4data/bias-detection-model",
+)
 print("post-install complete!")
