
ci: Add readability assessment to promptfoo GHA workflow #313


Merged: 7 commits merged on May 22, 2025
9 changes: 8 additions & 1 deletion .github/workflows/promptfoo-googlesheet-evaluation.yml
@@ -10,6 +10,7 @@ on:
- 'app/src/generate.py'
- 'app/promptfoo/promptfooconfig.ci.yaml'
- 'app/promptfoo/generateUniqueId.js'
- 'app/promptfoo/readability_assessment.py'
- '.github/workflows/promptfoo-googlesheet-evaluation.yml'
workflow_dispatch:
inputs:
@@ -54,7 +55,12 @@ jobs:
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y jq gettext
sudo apt-get install -y jq gettext python3-pip

- name: Install Python dependencies
run: |
python3 -m pip install textdescriptives spacy
python3 -m spacy download en_core_web_sm

- name: Set up Google Cloud credentials
run: |
@@ -77,6 +83,7 @@ jobs:
- name: Process config file
run: |
cp app/promptfoo/generateUniqueId.js /tmp/generateUniqueId.js
cp app/promptfoo/readability_assessment.py /tmp/readability_assessment.py
envsubst < app/promptfoo/promptfooconfig.ci.yaml > /tmp/promptfooconfig.processed.yaml
echo "Config file processed, checking..."
grep -v "GOOGLE_SHEET\|CHATBOT_INSTANCE" /tmp/promptfooconfig.processed.yaml | grep -i "url\|path"
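The processed config is expected to reference the copied script at /tmp/readability_assessment.py as a Python assertion (the promptfoo config itself is not shown in this diff). As a local sanity check of the new dependency step (a sketch for a developer machine, not part of the workflow), the packages installed above can be exercised with the same metric group the assertion script relies on:

# Local sanity check (not part of the workflow): confirms that the packages
# installed by the "Install Python dependencies" step import cleanly and that
# the readability metrics used by readability_assessment.py can be computed.
import spacy
import textdescriptives as td

# The workflow downloads this model with `python3 -m spacy download en_core_web_sm`;
# spacy.load raises OSError if it is missing.
spacy.load("en_core_web_sm")

# Same call pattern as readability_assessment.py.
df = td.extract_metrics(
    text="This is a short sample sentence to exercise the readability pipeline.",
    spacy_model="en_core_web_sm",
    metrics=["readability"],
)
print(df[["flesch_reading_ease", "flesch_kincaid_grade",
          "gunning_fog", "coleman_liau_index"]])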
108 changes: 108 additions & 0 deletions app/promptfoo/readability_assessment.py
@@ -0,0 +1,108 @@
from typing import Dict, Union, Any
import textdescriptives as td
import numpy as np

def get_assert(output: str, context) -> Union[bool, float, Dict[str, Any]]:
"""
Assess the readability of the output text using TextDescriptives instead of py-readability-metrics.
Returns a GradingResult with component scores for different readability metrics.
"""
print("=== TEXTDESCRIPTIVES READABILITY ASSESSMENT STARTING ===")
print(f"Output to assess: {output}")

try:
if not output or len(output.strip()) == 0:
return {
'pass': False,
'score': 0.0,
'reason': 'Empty or invalid output text'
}

# Use TextDescriptives to calculate readability metrics
metrics_df = td.extract_metrics(
text=output,
spacy_model="en_core_web_sm",
metrics=["readability"]
)

# Extract the readability metrics and convert from numpy types to Python native types
flesch_reading_ease = float(metrics_df["flesch_reading_ease"].iloc[0])
flesch_kincaid_grade = float(metrics_df["flesch_kincaid_grade"].iloc[0])
gunning_fog = float(metrics_df["gunning_fog"].iloc[0])
coleman_liau_index = float(metrics_df["coleman_liau_index"].iloc[0])

# Set thresholds for readability
MAX_GRADE_LEVEL = 12.0 # Maximum acceptable grade level (high school)
MIN_FLESCH_EASE = 50.0 # Minimum acceptable Flesch Reading Ease score

# Calculate average grade level from metrics
grade_levels = [flesch_kincaid_grade, gunning_fog, coleman_liau_index]
avg_grade_level = sum(grade_levels) / len(grade_levels)

# Determine if the text passes readability requirements
passes_grade_level = bool(avg_grade_level <= MAX_GRADE_LEVEL)
passes_flesch_ease = bool(flesch_reading_ease >= MIN_FLESCH_EASE)

# Calculate normalized score (0-1)
grade_level_score = float(max(0, 1 - (avg_grade_level / (MAX_GRADE_LEVEL * 1.5))))
flesch_ease_score = float(flesch_reading_ease / 100.0)

# Overall score is average of both metrics
overall_score = float((grade_level_score + flesch_ease_score) / 2)

# Ensure all values are standard Python types, not numpy types
def numpy_to_python(obj):
if isinstance(obj, np.integer):
return int(obj)
elif isinstance(obj, np.floating):
return float(obj)
elif isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, np.bool_):
return bool(obj)
elif isinstance(obj, dict):
return {k: numpy_to_python(v) for k, v in obj.items()}
elif isinstance(obj, list):
return [numpy_to_python(i) for i in obj]
else:
return obj

# Return comprehensive grading result
result = {
'pass': passes_grade_level and passes_flesch_ease,
'score': overall_score,
'reason': f'Readability assessment: Average grade level: {avg_grade_level:.1f}, Flesch ease: {flesch_reading_ease:.1f}',
'componentResults': [
{
'pass': passes_grade_level,
'score': grade_level_score,
'reason': f'Grade Level (target ≤ {MAX_GRADE_LEVEL}): {avg_grade_level:.1f}'
},
{
'pass': passes_flesch_ease,
'score': flesch_ease_score,
'reason': f'Flesch Reading Ease (target ≥ {MIN_FLESCH_EASE}): {flesch_reading_ease:.1f}'
}
],
'namedScores': {
'flesch_kincaid_grade': flesch_kincaid_grade,
'flesch_ease': flesch_reading_ease,
'gunning_fog_grade': gunning_fog,
'coleman_liau_grade': coleman_liau_index,
'avg_grade_level': avg_grade_level
}
}

# Convert any remaining numpy types to Python native types
result = numpy_to_python(result)

print("Assessment result:", result)
return result

except Exception as e:
print(f"Error in readability assessment: {str(e)}")
return {
'pass': False,
'score': 0.0,
'reason': f'Error in readability assessment: {str(e)}'
}
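The script above follows promptfoo's Python assertion interface, exposing a get_assert(output, context) function that returns a GradingResult-style dict. A minimal way to exercise it locally (hypothetical usage, assuming the script is on the import path; in CI promptfoo supplies output and context itself):

# Hypothetical local test of the assertion hook; not part of this PR.
from readability_assessment import get_assert

sample_output = (
    "You can apply for benefits online. The form asks for your name, "
    "address, and income. Most people finish it in about twenty minutes."
)

# `context` is unused by this assertion, so an empty dict stands in for it.
result = get_assert(sample_output, {})
print(result["pass"], result["score"])
print(result["reason"])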