From 1779f26d4bbd069239a2767fd44e700bc40d2acf Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Tue, 20 May 2025 14:01:45 -0700
Subject: [PATCH 1/6] feat: Add readability python assertion using TextDescriptives

---
 .../promptfoo-googlesheet-evaluation.yml |  11 +-
 app/promptfoo/promptfooconfig.ci.yaml    |   3 +
 app/promptfoo/readability_assessment.py  | 108 ++++++++++++++++++
 3 files changed, 120 insertions(+), 2 deletions(-)
 create mode 100644 app/promptfoo/readability_assessment.py

diff --git a/.github/workflows/promptfoo-googlesheet-evaluation.yml b/.github/workflows/promptfoo-googlesheet-evaluation.yml
index 85252fc3..d51a6522 100644
--- a/.github/workflows/promptfoo-googlesheet-evaluation.yml
+++ b/.github/workflows/promptfoo-googlesheet-evaluation.yml
@@ -10,6 +10,7 @@ on:
       - 'app/src/generate.py'
       - 'app/promptfoo/promptfooconfig.ci.yaml'
       - 'app/promptfoo/generateUniqueId.js'
+      - 'app/promptfoo/readability_assessment.py'
       - '.github/workflows/promptfoo-googlesheet-evaluation.yml'
   workflow_dispatch:
     inputs:
@@ -54,7 +55,12 @@ jobs:
       - name: Install system dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install -y jq gettext
+          sudo apt-get install -y jq gettext python3-pip
+
+      - name: Install Python dependencies
+        run: |
+          python3 -m pip install textdescriptives spacy
+          python3 -m spacy download en_core_web_sm
 
       - name: Set up Google Cloud credentials
         run: |
@@ -77,6 +83,7 @@ jobs:
       - name: Process config file
         run: |
           cp app/promptfoo/generateUniqueId.js /tmp/generateUniqueId.js
+          cp app/promptfoo/readability_assessment.py /tmp/readability_assessment.py
           envsubst < app/promptfoo/promptfooconfig.ci.yaml > /tmp/promptfooconfig.processed.yaml
           echo "Config file processed, checking..."
           grep -v "GOOGLE_SHEET\|CHATBOT_INSTANCE" /tmp/promptfooconfig.processed.yaml | grep -i "url\|path"
@@ -134,7 +141,7 @@ jobs:
           fi
 
       - name: Create PR comment
-        if: github.event_name == 'pull_request'
+        if: github.event.name == 'pull_request'
         uses: actions/github-script@v7
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/app/promptfoo/promptfooconfig.ci.yaml b/app/promptfoo/promptfooconfig.ci.yaml
index e2673be8..7e0e6ffd 100644
--- a/app/promptfoo/promptfooconfig.ci.yaml
+++ b/app/promptfoo/promptfooconfig.ci.yaml
@@ -22,6 +22,9 @@ defaultTest:
     uniqueSessionId: file:///tmp/generateUniqueId.js
   options:
     timeout: 360000
+  assert:
+    - type: python
+      value: file://tmp/readability_assessment.py
 
 evaluateOptions:
   delay: 1000
diff --git a/app/promptfoo/readability_assessment.py b/app/promptfoo/readability_assessment.py
new file mode 100644
index 00000000..54bfa2e8
--- /dev/null
+++ b/app/promptfoo/readability_assessment.py
@@ -0,0 +1,108 @@
+from typing import Dict, Union, Any
+import textdescriptives as td
+import numpy as np
+
+def get_assert(output: str, context) -> Union[bool, float, Dict[str, Any]]:
+    """
+    Assess the readability of the output text using TextDescriptives instead of py-readability-metrics.
+    Returns a GradingResult with component scores for different readability metrics.
+    """
+    print("=== TEXTDESCRIPTIVES READABILITY ASSESSMENT STARTING ===")
+    print(f"Output to assess: {output}")
+
+    try:
+        if not output or len(output.strip()) == 0:
+            return {
+                'pass': False,
+                'score': 0.0,
+                'reason': 'Empty or invalid output text'
+            }
+
+        # Use TextDescriptives to calculate readability metrics
+        metrics_df = td.extract_metrics(
+            text=output,
+            spacy_model="en_core_web_sm",
+            metrics=["readability"]
+        )
+
+        # Extract the readability metrics and convert from numpy types to Python native types
+        flesch_reading_ease = float(metrics_df["flesch_reading_ease"].iloc[0])
+        flesch_kincaid_grade = float(metrics_df["flesch_kincaid_grade"].iloc[0])
+        gunning_fog = float(metrics_df["gunning_fog"].iloc[0])
+        coleman_liau_index = float(metrics_df["coleman_liau_index"].iloc[0])
+
+        # Set thresholds for readability
+        MAX_GRADE_LEVEL = 12.0 # Maximum acceptable grade level (high school)
+        MIN_FLESCH_EASE = 50.0 # Minimum acceptable Flesch Reading Ease score
+
+        # Calculate average grade level from metrics
+        grade_levels = [flesch_kincaid_grade, gunning_fog, coleman_liau_index]
+        avg_grade_level = sum(grade_levels) / len(grade_levels)
+
+        # Determine if the text passes readability requirements
+        passes_grade_level = bool(avg_grade_level <= MAX_GRADE_LEVEL)
+        passes_flesch_ease = bool(flesch_reading_ease >= MIN_FLESCH_EASE)
+
+        # Calculate normalized score (0-1)
+        grade_level_score = float(max(0, 1 - (avg_grade_level / (MAX_GRADE_LEVEL * 1.5))))
+        flesch_ease_score = float(flesch_reading_ease / 100.0)
+
+        # Overall score is average of both metrics
+        overall_score = float((grade_level_score + flesch_ease_score) / 2)
+
+        # Ensure all values are standard Python types, not numpy types
+        def numpy_to_python(obj):
+            if isinstance(obj, np.integer):
+                return int(obj)
+            elif isinstance(obj, np.floating):
+                return float(obj)
+            elif isinstance(obj, np.ndarray):
+                return obj.tolist()
+            elif isinstance(obj, np.bool_):
+                return bool(obj)
+            elif isinstance(obj, dict):
+                return {k: numpy_to_python(v) for k, v in obj.items()}
+            elif isinstance(obj, list):
+                return [numpy_to_python(i) for i in obj]
+            else:
+                return obj
+
+        # Return comprehensive grading result
+        result = {
+            'pass': passes_grade_level and passes_flesch_ease,
+            'score': overall_score,
+            'reason': f'Readability assessment: Average grade level: {avg_grade_level:.1f}, Flesch ease: {flesch_reading_ease:.1f}',
+            'componentResults': [
+                {
+                    'pass': passes_grade_level,
+                    'score': grade_level_score,
+                    'reason': f'Grade Level (target ≤ {MAX_GRADE_LEVEL}): {avg_grade_level:.1f}'
+                },
+                {
+                    'pass': passes_flesch_ease,
+                    'score': flesch_ease_score,
+                    'reason': f'Flesch Reading Ease (target ≥ {MIN_FLESCH_EASE}): {flesch_reading_ease:.1f}'
+                }
+            ],
+            'namedScores': {
+                'flesch_kincaid_grade': flesch_kincaid_grade,
+                'flesch_ease': flesch_reading_ease,
+                'gunning_fog_grade': gunning_fog,
+                'coleman_liau_grade': coleman_liau_index,
+                'avg_grade_level': avg_grade_level
+            }
+        }
+
+        # Convert any remaining numpy types to Python native types
+        result = numpy_to_python(result)
+
+        print("Assessment result:", result)
+        return result
+
+    except Exception as e:
+        print(f"Error in readability assessment: {str(e)}")
+        return {
+            'pass': False,
+            'score': 0.0,
+            'reason': f'Error in readability assessment: {str(e)}'
+        }
\ No newline at end of file

From d12b80d46240485aa5f7ea114eec85e8dc430932 Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Tue, 20 May 2025 14:17:02 -0700
Subject: [PATCH 2/6] fix: path/to/ the readability file

---
 app/promptfoo/promptfooconfig.ci.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/promptfoo/promptfooconfig.ci.yaml b/app/promptfoo/promptfooconfig.ci.yaml
index 7e0e6ffd..2b6f0d13 100644
--- a/app/promptfoo/promptfooconfig.ci.yaml
+++ b/app/promptfoo/promptfooconfig.ci.yaml
@@ -24,7 +24,7 @@ defaultTest:
     timeout: 360000
   assert:
     - type: python
-      value: file://tmp/readability_assessment.py
+      value: file:///tmp/readability_assessment.py
 
 evaluateOptions:
   delay: 1000

From bf17db63ec4a53f7ca60c6f7440f1ca7fd0c5cf1 Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Tue, 20 May 2025 15:06:52 -0700
Subject: [PATCH 3/6] fix: only run python assertion for tagged questions

---
 app/promptfoo/promptfooconfig.ci.yaml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/app/promptfoo/promptfooconfig.ci.yaml b/app/promptfoo/promptfooconfig.ci.yaml
index 2b6f0d13..e2673be8 100644
--- a/app/promptfoo/promptfooconfig.ci.yaml
+++ b/app/promptfoo/promptfooconfig.ci.yaml
@@ -22,9 +22,6 @@ defaultTest:
     uniqueSessionId: file:///tmp/generateUniqueId.js
   options:
     timeout: 360000
-  assert:
-    - type: python
-      value: file:///tmp/readability_assessment.py
 
 evaluateOptions:
   delay: 1000

From 2cea951097552678d18ac2c63110fbdbf376efe0 Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Tue, 20 May 2025 15:25:41 -0700
Subject: [PATCH 4/6] fix: PR comment when event is pull_request

---
 .github/workflows/promptfoo-googlesheet-evaluation.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/promptfoo-googlesheet-evaluation.yml b/.github/workflows/promptfoo-googlesheet-evaluation.yml
index d51a6522..062a40fa 100644
--- a/.github/workflows/promptfoo-googlesheet-evaluation.yml
+++ b/.github/workflows/promptfoo-googlesheet-evaluation.yml
@@ -141,7 +141,7 @@ jobs:
           fi
 
       - name: Create PR comment
-        if: github.event.name == 'pull_request'
+        if: github.event_name == 'pull_request'
         uses: actions/github-script@v7
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}

From 29b1b06e8fa1fd5cf953f0d1e14c2a8208e54d7f Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Thu, 22 May 2025 08:46:34 -0700
Subject: [PATCH 5/6] fix: add solutions to YLs comments

---
 app/promptfoo/readability_assessment.py | 80 ++++++++++---------------
 1 file changed, 32 insertions(+), 48 deletions(-)

diff --git a/app/promptfoo/readability_assessment.py b/app/promptfoo/readability_assessment.py
index 54bfa2e8..19bca489 100644
--- a/app/promptfoo/readability_assessment.py
+++ b/app/promptfoo/readability_assessment.py
@@ -2,11 +2,30 @@
 import textdescriptives as td
 import numpy as np
 
-def get_assert(output: str, context) -> Union[bool, float, Dict[str, Any]]:
-    """
-    Assess the readability of the output text using TextDescriptives instead of py-readability-metrics.
-    Returns a GradingResult with component scores for different readability metrics.
-    """
+# Readability thresholds
+MAX_GRADE_LEVEL = 12.0 # Maximum acceptable grade level (high school)
+MIN_FLESCH_EASE = 50.0 # Minimum acceptable Flesch Reading Ease score
+
+def _calculate_readability_metrics(metrics_df) -> Dict[str, float]:
+    # Extract the readability metrics and convert from numpy types to Python native types
+    flesch_reading_ease = float(metrics_df["flesch_reading_ease"].iloc[0])
+    flesch_kincaid_grade = float(metrics_df["flesch_kincaid_grade"].iloc[0])
+    gunning_fog = float(metrics_df["gunning_fog"].iloc[0])
+    coleman_liau_index = float(metrics_df["coleman_liau_index"].iloc[0])
+
+    # Calculate average grade level
+    grade_levels = [flesch_kincaid_grade, gunning_fog, coleman_liau_index]
+    avg_grade_level = sum(grade_levels) / len(grade_levels)
+
+    return {
+        "flesch_kincaid_grade": flesch_kincaid_grade,
+        "flesch_ease": flesch_reading_ease,
+        "gunning_fog_grade": gunning_fog,
+        "coleman_liau_grade": coleman_liau_index,
+        "avg_grade_level": avg_grade_level
+    }
+
+def get_assert(output: str) -> Union[bool, float, Dict[str, Any]]:
     print("=== TEXTDESCRIPTIVES READABILITY ASSESSMENT STARTING ===")
     print(f"Output to assess: {output}")
 
@@ -25,19 +44,10 @@ def get_assert(output: str, context) -> Union[bool, float, Dict[str, Any]]:
             metrics=["readability"]
         )
 
-        # Extract the readability metrics and convert from numpy types to Python native types
-        flesch_reading_ease = float(metrics_df["flesch_reading_ease"].iloc[0])
-        flesch_kincaid_grade = float(metrics_df["flesch_kincaid_grade"].iloc[0])
-        gunning_fog = float(metrics_df["gunning_fog"].iloc[0])
-        coleman_liau_index = float(metrics_df["coleman_liau_index"].iloc[0])
-
-        # Set thresholds for readability
-        MAX_GRADE_LEVEL = 12.0 # Maximum acceptable grade level (high school)
-        MIN_FLESCH_EASE = 50.0 # Minimum acceptable Flesch Reading Ease score
-
-        # Calculate average grade level from metrics
-        grade_levels = [flesch_kincaid_grade, gunning_fog, coleman_liau_index]
-        avg_grade_level = sum(grade_levels) / len(grade_levels)
+        # Get readability metrics
+        metrics = _calculate_readability_metrics(metrics_df)
+        flesch_reading_ease = metrics["flesch_ease"]
+        avg_grade_level = metrics["avg_grade_level"]
 
         # Determine if the text passes readability requirements
         passes_grade_level = bool(avg_grade_level <= MAX_GRADE_LEVEL)
         passes_flesch_ease = bool(flesch_reading_ease >= MIN_FLESCH_EASE)
@@ -50,23 +60,6 @@ def get_assert(output: str, context) -> Union[bool, float, Dict[str, Any]]:
         # Overall score is average of both metrics
         overall_score = float((grade_level_score + flesch_ease_score) / 2)
 
-        # Ensure all values are standard Python types, not numpy types
-        def numpy_to_python(obj):
-            if isinstance(obj, np.integer):
-                return int(obj)
-            elif isinstance(obj, np.floating):
-                return float(obj)
-            elif isinstance(obj, np.ndarray):
-                return obj.tolist()
-            elif isinstance(obj, np.bool_):
-                return bool(obj)
-            elif isinstance(obj, dict):
-                return {k: numpy_to_python(v) for k, v in obj.items()}
-            elif isinstance(obj, list):
-                return [numpy_to_python(i) for i in obj]
-            else:
-                return obj
-
         # Return comprehensive grading result
         result = {
             'pass': passes_grade_level and passes_flesch_ease,
@@ -84,25 +77,16 @@ def get_assert(output: str, context) -> Union[bool, float, Dict[str, Any]]:
                     'reason': f'Flesch Reading Ease (target ≥ {MIN_FLESCH_EASE}): {flesch_reading_ease:.1f}'
                 }
             ],
-            'namedScores': {
-                'flesch_kincaid_grade': flesch_kincaid_grade,
-                'flesch_ease': flesch_reading_ease,
-                'gunning_fog_grade': gunning_fog,
-                'coleman_liau_grade': coleman_liau_index,
-                'avg_grade_level': avg_grade_level
-            }
+            'namedScores': metrics
         }
 
-        # Convert any remaining numpy types to Python native types
-        result = numpy_to_python(result)
-
         print("Assessment result:", result)
         return result
 
     except Exception as e:
-        print(f"Error in readability assessment: {str(e)}")
+        print(f"Error in readability assessment: {e}")
         return {
             'pass': False,
-            'score': 0.0,
-            'reason': f'Error in readability assessment: {str(e)}'
+            'score': -1.0, # Negative score indicates error processing input
+            'reason': f'Error in readability assessment: {e}'
         }
\ No newline at end of file

From a1ba504be23db3b53e31d3affa366487586fe09a Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Thu, 22 May 2025 09:01:27 -0700
Subject: [PATCH 6/6] fix: add back required context param

---
 app/promptfoo/readability_assessment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/promptfoo/readability_assessment.py b/app/promptfoo/readability_assessment.py
index 19bca489..788c9c91 100644
--- a/app/promptfoo/readability_assessment.py
+++ b/app/promptfoo/readability_assessment.py
@@ -25,7 +25,7 @@ def _calculate_readability_metrics(metrics_df) -> Dict[str, float]:
         "avg_grade_level": avg_grade_level
     }
 
-def get_assert(output: str) -> Union[bool, float, Dict[str, Any]]:
+def get_assert(output: str, context: Any = None) -> Union[bool, float, Dict[str, Any]]:
     print("=== TEXTDESCRIPTIVES READABILITY ASSESSMENT STARTING ===")
     print(f"Output to assess: {output}")
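
A quick way to sanity-check the assertion introduced in this series is to call the hook directly, outside promptfoo. The sketch below is illustrative only: the script name and sample text are made up, it assumes textdescriptives, spacy, and the en_core_web_sm model are installed (as the workflow's "Install Python dependencies" step does), and it should be run from app/promptfoo/ so the import resolves.

# local_readability_check.py - hypothetical helper, not part of the patch series.
# Calls get_assert() roughly the way promptfoo's python assertion provider would,
# so the MAX_GRADE_LEVEL / MIN_FLESCH_EASE thresholds can be inspected before a full eval run.
from readability_assessment import get_assert

sample_output = (
    "You may be able to get help paying for food. "
    "Fill out the short form online, and a caseworker will contact you within one week."
)

result = get_assert(sample_output, context=None)

print("pass:", result["pass"])
print("score:", round(result["score"], 3))
# componentResults is only present on the success path, so fall back to an empty list.
for component in result.get("componentResults", []):
    print(" -", component["reason"])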