From 1779f26d4bbd069239a2767fd44e700bc40d2acf Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Tue, 20 May 2025 14:01:45 -0700
Subject: [PATCH 1/6] feat: Add readability python assertion using TextDescriptives

---
 .../promptfoo-googlesheet-evaluation.yml |  11 +-
 app/promptfoo/promptfooconfig.ci.yaml    |   3 +
 app/promptfoo/readability_assessment.py  | 108 ++++++++++++++++++
 3 files changed, 120 insertions(+), 2 deletions(-)
 create mode 100644 app/promptfoo/readability_assessment.py

diff --git a/.github/workflows/promptfoo-googlesheet-evaluation.yml b/.github/workflows/promptfoo-googlesheet-evaluation.yml
index 85252fc3..d51a6522 100644
--- a/.github/workflows/promptfoo-googlesheet-evaluation.yml
+++ b/.github/workflows/promptfoo-googlesheet-evaluation.yml
@@ -10,6 +10,7 @@ on:
       - 'app/src/generate.py'
       - 'app/promptfoo/promptfooconfig.ci.yaml'
       - 'app/promptfoo/generateUniqueId.js'
+      - 'app/promptfoo/readability_assessment.py'
       - '.github/workflows/promptfoo-googlesheet-evaluation.yml'
   workflow_dispatch:
     inputs:
@@ -54,7 +55,12 @@ jobs:
       - name: Install system dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install -y jq gettext
+          sudo apt-get install -y jq gettext python3-pip
+
+      - name: Install Python dependencies
+        run: |
+          python3 -m pip install textdescriptives spacy
+          python3 -m spacy download en_core_web_sm
 
       - name: Set up Google Cloud credentials
         run: |
@@ -77,6 +83,7 @@ jobs:
       - name: Process config file
         run: |
           cp app/promptfoo/generateUniqueId.js /tmp/generateUniqueId.js
+          cp app/promptfoo/readability_assessment.py /tmp/readability_assessment.py
           envsubst < app/promptfoo/promptfooconfig.ci.yaml > /tmp/promptfooconfig.processed.yaml
           echo "Config file processed, checking..."
           grep -v "GOOGLE_SHEET\|CHATBOT_INSTANCE" /tmp/promptfooconfig.processed.yaml | grep -i "url\|path"
@@ -134,7 +141,7 @@ jobs:
           fi
 
       - name: Create PR comment
-        if: github.event_name == 'pull_request'
+        if: github.event.name == 'pull_request'
         uses: actions/github-script@v7
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/app/promptfoo/promptfooconfig.ci.yaml b/app/promptfoo/promptfooconfig.ci.yaml
index e2673be8..7e0e6ffd 100644
--- a/app/promptfoo/promptfooconfig.ci.yaml
+++ b/app/promptfoo/promptfooconfig.ci.yaml
@@ -22,6 +22,9 @@ defaultTest:
     uniqueSessionId: file:///tmp/generateUniqueId.js
   options:
     timeout: 360000
+  assert:
+    - type: python
+      value: file://tmp/readability_assessment.py
 
 evaluateOptions:
   delay: 1000
diff --git a/app/promptfoo/readability_assessment.py b/app/promptfoo/readability_assessment.py
new file mode 100644
index 00000000..54bfa2e8
--- /dev/null
+++ b/app/promptfoo/readability_assessment.py
@@ -0,0 +1,108 @@
+from typing import Dict, Union, Any
+import textdescriptives as td
+import numpy as np
+
+def get_assert(output: str, context) -> Union[bool, float, Dict[str, Any]]:
+    """
+    Assess the readability of the output text using TextDescriptives instead of py-readability-metrics.
+    Returns a GradingResult with component scores for different readability metrics.
+    """
+    print("=== TEXTDESCRIPTIVES READABILITY ASSESSMENT STARTING ===")
+    print(f"Output to assess: {output}")
+
+    try:
+        if not output or len(output.strip()) == 0:
+            return {
+                'pass': False,
+                'score': 0.0,
+                'reason': 'Empty or invalid output text'
+            }
+
+        # Use TextDescriptives to calculate readability metrics
+        metrics_df = td.extract_metrics(
+            text=output,
+            spacy_model="en_core_web_sm",
+            metrics=["readability"]
+        )
+
+        # Extract the readability metrics and convert from numpy types to Python native types
+        flesch_reading_ease = float(metrics_df["flesch_reading_ease"].iloc[0])
+        flesch_kincaid_grade = float(metrics_df["flesch_kincaid_grade"].iloc[0])
+        gunning_fog = float(metrics_df["gunning_fog"].iloc[0])
+        coleman_liau_index = float(metrics_df["coleman_liau_index"].iloc[0])
+
+        # Set thresholds for readability
+        MAX_GRADE_LEVEL = 12.0 # Maximum acceptable grade level (high school)
+        MIN_FLESCH_EASE = 50.0 # Minimum acceptable Flesch Reading Ease score
+
+        # Calculate average grade level from metrics
+        grade_levels = [flesch_kincaid_grade, gunning_fog, coleman_liau_index]
+        avg_grade_level = sum(grade_levels) / len(grade_levels)
+
+        # Determine if the text passes readability requirements
+        passes_grade_level = bool(avg_grade_level <= MAX_GRADE_LEVEL)
+        passes_flesch_ease = bool(flesch_reading_ease >= MIN_FLESCH_EASE)
+
+        # Calculate normalized score (0-1)
+        grade_level_score = float(max(0, 1 - (avg_grade_level / (MAX_GRADE_LEVEL * 1.5))))
+        flesch_ease_score = float(flesch_reading_ease / 100.0)
+
+        # Overall score is average of both metrics
+        overall_score = float((grade_level_score + flesch_ease_score) / 2)
+
+        # Ensure all values are standard Python types, not numpy types
+        def numpy_to_python(obj):
+            if isinstance(obj, np.integer):
+                return int(obj)
+            elif isinstance(obj, np.floating):
+                return float(obj)
+            elif isinstance(obj, np.ndarray):
+                return obj.tolist()
+            elif isinstance(obj, np.bool_):
+                return bool(obj)
+            elif isinstance(obj, dict):
+                return {k: numpy_to_python(v) for k, v in obj.items()}
+            elif isinstance(obj, list):
+                return [numpy_to_python(i) for i in obj]
+            else:
+                return obj
+
+        # Return comprehensive grading result
+        result = {
+            'pass': passes_grade_level and passes_flesch_ease,
+            'score': overall_score,
+            'reason': f'Readability assessment: Average grade level: {avg_grade_level:.1f}, Flesch ease: {flesch_reading_ease:.1f}',
+            'componentResults': [
+                {
+                    'pass': passes_grade_level,
+                    'score': grade_level_score,
+                    'reason': f'Grade Level (target ≤ {MAX_GRADE_LEVEL}): {avg_grade_level:.1f}'
+                },
+                {
+                    'pass': passes_flesch_ease,
+                    'score': flesch_ease_score,
+                    'reason': f'Flesch Reading Ease (target ≥ {MIN_FLESCH_EASE}): {flesch_reading_ease:.1f}'
+                }
+            ],
+            'namedScores': {
+                'flesch_kincaid_grade': flesch_kincaid_grade,
+                'flesch_ease': flesch_reading_ease,
+                'gunning_fog_grade': gunning_fog,
+                'coleman_liau_grade': coleman_liau_index,
+                'avg_grade_level': avg_grade_level
+            }
+        }
+
+        # Convert any remaining numpy types to Python native types
+        result = numpy_to_python(result)
+
+        print("Assessment result:", result)
+        return result
+
+    except Exception as e:
+        print(f"Error in readability assessment: {str(e)}")
+        return {
+            'pass': False,
+            'score': 0.0,
+            'reason': f'Error in readability assessment: {str(e)}'
+        }
\ No newline at end of file

From d12b80d46240485aa5f7ea114eec85e8dc430932 Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Tue, 20 May 2025 14:17:02 -0700
Subject: [PATCH 2/6] fix: path/to/ the readability file

---
 app/promptfoo/promptfooconfig.ci.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/promptfoo/promptfooconfig.ci.yaml b/app/promptfoo/promptfooconfig.ci.yaml
index 7e0e6ffd..2b6f0d13 100644
--- a/app/promptfoo/promptfooconfig.ci.yaml
+++ b/app/promptfoo/promptfooconfig.ci.yaml
@@ -24,7 +24,7 @@ defaultTest:
     timeout: 360000
   assert:
     - type: python
-      value: file://tmp/readability_assessment.py
+      value: file:///tmp/readability_assessment.py
 
 evaluateOptions:
   delay: 1000

From bf17db63ec4a53f7ca60c6f7440f1ca7fd0c5cf1 Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Tue, 20 May 2025 15:06:52 -0700
Subject: [PATCH 3/6] fix: only run python assertion for tagged questions

---
 app/promptfoo/promptfooconfig.ci.yaml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/app/promptfoo/promptfooconfig.ci.yaml b/app/promptfoo/promptfooconfig.ci.yaml
index 2b6f0d13..e2673be8 100644
--- a/app/promptfoo/promptfooconfig.ci.yaml
+++ b/app/promptfoo/promptfooconfig.ci.yaml
@@ -22,9 +22,6 @@ defaultTest:
     uniqueSessionId: file:///tmp/generateUniqueId.js
   options:
     timeout: 360000
-  assert:
-    - type: python
-      value: file:///tmp/readability_assessment.py
 
 evaluateOptions:
   delay: 1000

From 2cea951097552678d18ac2c63110fbdbf376efe0 Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Tue, 20 May 2025 15:25:41 -0700
Subject: [PATCH 4/6] fix: PR comment when event is pull_request

---
 .github/workflows/promptfoo-googlesheet-evaluation.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/promptfoo-googlesheet-evaluation.yml b/.github/workflows/promptfoo-googlesheet-evaluation.yml
index d51a6522..062a40fa 100644
--- a/.github/workflows/promptfoo-googlesheet-evaluation.yml
+++ b/.github/workflows/promptfoo-googlesheet-evaluation.yml
@@ -141,7 +141,7 @@ jobs:
           fi
 
       - name: Create PR comment
-        if: github.event.name == 'pull_request'
+        if: github.event_name == 'pull_request'
         uses: actions/github-script@v7
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}

From 29b1b06e8fa1fd5cf953f0d1e14c2a8208e54d7f Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Thu, 22 May 2025 08:46:34 -0700
Subject: [PATCH 5/6] fix: add solutions to YLs comments

---
 app/promptfoo/readability_assessment.py | 80 ++++++++++---------------
 1 file changed, 32 insertions(+), 48 deletions(-)

diff --git a/app/promptfoo/readability_assessment.py b/app/promptfoo/readability_assessment.py
index 54bfa2e8..19bca489 100644
--- a/app/promptfoo/readability_assessment.py
+++ b/app/promptfoo/readability_assessment.py
@@ -2,11 +2,30 @@
 import textdescriptives as td
 import numpy as np
 
-def get_assert(output: str, context) -> Union[bool, float, Dict[str, Any]]:
-    """
-    Assess the readability of the output text using TextDescriptives instead of py-readability-metrics.
-    Returns a GradingResult with component scores for different readability metrics.
-    """
+# Readability thresholds
+MAX_GRADE_LEVEL = 12.0 # Maximum acceptable grade level (high school)
+MIN_FLESCH_EASE = 50.0 # Minimum acceptable Flesch Reading Ease score
+
+def _calculate_readability_metrics(metrics_df) -> Dict[str, float]:
+    # Extract the readability metrics and convert from numpy types to Python native types
+    flesch_reading_ease = float(metrics_df["flesch_reading_ease"].iloc[0])
+    flesch_kincaid_grade = float(metrics_df["flesch_kincaid_grade"].iloc[0])
+    gunning_fog = float(metrics_df["gunning_fog"].iloc[0])
+    coleman_liau_index = float(metrics_df["coleman_liau_index"].iloc[0])
+
+    # Calculate average grade level
+    grade_levels = [flesch_kincaid_grade, gunning_fog, coleman_liau_index]
+    avg_grade_level = sum(grade_levels) / len(grade_levels)
+
+    return {
+        "flesch_kincaid_grade": flesch_kincaid_grade,
+        "flesch_ease": flesch_reading_ease,
+        "gunning_fog_grade": gunning_fog,
+        "coleman_liau_grade": coleman_liau_index,
+        "avg_grade_level": avg_grade_level
+    }
+
+def get_assert(output: str) -> Union[bool, float, Dict[str, Any]]:
     print("=== TEXTDESCRIPTIVES READABILITY ASSESSMENT STARTING ===")
     print(f"Output to assess: {output}")
 
@@ -25,19 +44,10 @@ def get_assert(output: str, context) -> Union[bool, float, Dict[str, Any]]:
             metrics=["readability"]
         )
 
-        # Extract the readability metrics and convert from numpy types to Python native types
-        flesch_reading_ease = float(metrics_df["flesch_reading_ease"].iloc[0])
-        flesch_kincaid_grade = float(metrics_df["flesch_kincaid_grade"].iloc[0])
-        gunning_fog = float(metrics_df["gunning_fog"].iloc[0])
-        coleman_liau_index = float(metrics_df["coleman_liau_index"].iloc[0])
-
-        # Set thresholds for readability
-        MAX_GRADE_LEVEL = 12.0 # Maximum acceptable grade level (high school)
-        MIN_FLESCH_EASE = 50.0 # Minimum acceptable Flesch Reading Ease score
-
-        # Calculate average grade level from metrics
-        grade_levels = [flesch_kincaid_grade, gunning_fog, coleman_liau_index]
-        avg_grade_level = sum(grade_levels) / len(grade_levels)
+        # Get readability metrics
+        metrics = _calculate_readability_metrics(metrics_df)
+        flesch_reading_ease = metrics["flesch_ease"]
+        avg_grade_level = metrics["avg_grade_level"]
 
         # Determine if the text passes readability requirements
         passes_grade_level = bool(avg_grade_level <= MAX_GRADE_LEVEL)
         passes_flesch_ease = bool(flesch_reading_ease >= MIN_FLESCH_EASE)
@@ -50,23 +60,6 @@ def get_assert(output: str, context) -> Union[bool, float, Dict[str, Any]]:
         # Overall score is average of both metrics
         overall_score = float((grade_level_score + flesch_ease_score) / 2)
 
-        # Ensure all values are standard Python types, not numpy types
-        def numpy_to_python(obj):
-            if isinstance(obj, np.integer):
-                return int(obj)
-            elif isinstance(obj, np.floating):
-                return float(obj)
-            elif isinstance(obj, np.ndarray):
-                return obj.tolist()
-            elif isinstance(obj, np.bool_):
-                return bool(obj)
-            elif isinstance(obj, dict):
-                return {k: numpy_to_python(v) for k, v in obj.items()}
-            elif isinstance(obj, list):
-                return [numpy_to_python(i) for i in obj]
-            else:
-                return obj
-
         # Return comprehensive grading result
         result = {
             'pass': passes_grade_level and passes_flesch_ease,
@@ -84,25 +77,16 @@ def get_assert(output: str, context) -> Union[bool, float, Dict[str, Any]]:
                     'reason': f'Flesch Reading Ease (target ≥ {MIN_FLESCH_EASE}): {flesch_reading_ease:.1f}'
                 }
             ],
-            'namedScores': {
-                'flesch_kincaid_grade': flesch_kincaid_grade,
-                'flesch_ease': flesch_reading_ease,
-                'gunning_fog_grade': gunning_fog,
-                'coleman_liau_grade': coleman_liau_index,
-                'avg_grade_level': avg_grade_level
-            }
+            'namedScores': metrics
         }
 
-        # Convert any remaining numpy types to Python native types
-        result = numpy_to_python(result)
-
         print("Assessment result:", result)
         return result
 
     except Exception as e:
-        print(f"Error in readability assessment: {str(e)}")
+        print(f"Error in readability assessment: {e}")
         return {
             'pass': False,
-            'score': 0.0,
-            'reason': f'Error in readability assessment: {str(e)}'
+            'score': -1.0, # Negative score indicates error processing input
+            'reason': f'Error in readability assessment: {e}'
         }
\ No newline at end of file

From a1ba504be23db3b53e31d3affa366487586fe09a Mon Sep 17 00:00:00 2001
From: fg-nava <189638926+fg-nava@users.noreply.github.com>
Date: Thu, 22 May 2025 09:01:27 -0700
Subject: [PATCH 6/6] fix: add back required context param

---
 app/promptfoo/readability_assessment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/promptfoo/readability_assessment.py b/app/promptfoo/readability_assessment.py
index 19bca489..788c9c91 100644
--- a/app/promptfoo/readability_assessment.py
+++ b/app/promptfoo/readability_assessment.py
@@ -25,7 +25,7 @@ def _calculate_readability_metrics(metrics_df) -> Dict[str, float]:
         "avg_grade_level": avg_grade_level
     }
 
-def get_assert(output: str) -> Union[bool, float, Dict[str, Any]]:
+def get_assert(output: str, context: Any = None) -> Union[bool, float, Dict[str, Any]]:
     print("=== TEXTDESCRIPTIVES READABILITY ASSESSMENT STARTING ===")
     print(f"Output to assess: {output}")
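
A quick way to sanity-check the assertion introduced in this series is to call the hook directly, outside promptfoo. The sketch below is illustrative only: the script name and sample text are made up, it assumes textdescriptives, spacy, and the en_core_web_sm model are installed (as the workflow's "Install Python dependencies" step does), and it should be run from app/promptfoo/ so the import resolves.

# local_readability_check.py - hypothetical helper, not part of the patch series.
# Calls get_assert() roughly the way promptfoo's python assertion provider would,
# so the MAX_GRADE_LEVEL / MIN_FLESCH_EASE thresholds can be inspected before a full eval run.
from readability_assessment import get_assert

sample_output = (
    "You may be able to get help paying for food. "
    "Fill out the short form online, and a caseworker will contact you within one week."
)

result = get_assert(sample_output, context=None)

print("pass:", result["pass"])
print("score:", round(result["score"], 3))
# componentResults is only present on the success path, so fall back to an empty list.
for component in result.get("componentResults", []):
    print(" -", component["reason"])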