From 93c27b5a37eadcbd7fa5709f152b98384d4614f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Miko=C5=82ajczak?= Date: Fri, 17 Oct 2025 13:52:53 +0200 Subject: [PATCH 1/3] v1 --- ai_review_error_reporter.sh | 71 +++++++++++++++++++ claude_review_collector.sh | 136 ++++++++++++++++++++++++++++++++++++ job_error_aggregator.sh | 74 ++++++++++++++++++++ 3 files changed, 281 insertions(+) create mode 100755 ai_review_error_reporter.sh create mode 100755 claude_review_collector.sh create mode 100755 job_error_aggregator.sh diff --git a/ai_review_error_reporter.sh b/ai_review_error_reporter.sh new file mode 100755 index 000000000000..2d56fa964817 --- /dev/null +++ b/ai_review_error_reporter.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# Orchestrator script to collect and analyze AI review errors +# Usage: ./ai_review_error_reporter.sh [runs_limit] [output_file] +# runs_limit: Number of workflow runs to fetch (default: 100) +# output_file: Output file for the error report (default: job_error_report.md) +# +# Required environment variables: +# GITHUB_USER_SESSION: GitHub user session cookie for authenticated requests +# +# Required tools: +# - gh (GitHub CLI): Must be authenticated +# - jq: JSON processor +# - curl: HTTP client + +set -e # Exit on any error + +# Get script directory (where this script is located) +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +# Parameters +RUNS_LIMIT="${1:-100}" +OUTPUT_FILE="${2:-job_error_report.md}" + +echo "========================================" +echo "AI Review Error Reporter" +echo "========================================" +echo "" +echo "Configuration:" +echo " Workflow runs limit: $RUNS_LIMIT" +echo " Output report file: $OUTPUT_FILE" +echo "" + +# Step 1: Collect workflow run data +echo "========================================" +echo "Step 1: Collecting workflow run data" +echo "========================================" +echo "" + +if [ ! -x "$SCRIPT_DIR/claude_review_collector.sh" ]; then + echo "Error: claude_review_collector.sh not found or not executable" + exit 1 +fi + +"$SCRIPT_DIR/claude_review_collector.sh" "$RUNS_LIMIT" + +echo "" +echo "✅ Data collection completed" +echo "" + +# Step 2: Aggregate and analyze errors +echo "========================================" +echo "Step 2: Aggregating and analyzing errors" +echo "========================================" +echo "" + +if [ ! -x "$SCRIPT_DIR/job_error_aggregator.sh" ]; then + echo "Error: job_error_aggregator.sh not found or not executable" + exit 1 +fi + +"$SCRIPT_DIR/job_error_aggregator.sh" "job_errors_titles" "$OUTPUT_FILE" + +echo "" +echo "========================================" +echo "✅ Report generation completed!" 
+echo "========================================" +echo "" +echo "📊 Report saved to: $OUTPUT_FILE" +echo "" + diff --git a/claude_review_collector.sh b/claude_review_collector.sh new file mode 100755 index 000000000000..f0ea19e00f65 --- /dev/null +++ b/claude_review_collector.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# Script to collect and cache claude-review workflow run data +# Usage: ./claude_review_collector.sh [limit] +# limit: Number of workflow runs to fetch (default: 100) +# +# Required environment variables: +# GITHUB_USER_SESSION: GitHub user session cookie for authenticated requests +# +# Note: GitHub CLI (gh) must be authenticated for API requests + +WORKFLOW="claude-review.yml" + +# Validate required environment variables +if [ -z "$GITHUB_USER_SESSION" ]; then + echo "Error: GITHUB_USER_SESSION environment variable is not set" + echo "Please set it with: export GITHUB_USER_SESSION='your_session_token'" + exit 1 +fi + +# Workflow runs limit (default: 100, can be overridden by first argument) +RUNS_LIMIT="${1:-100}" + +# Cache directories +JOB_SUMMARY_URL_DIR="job_summary_url" +JOB_SUMMARY_MD_DIR="job_summary_md" +JOB_ERRORS_DIR="job_errors" +JOB_ERRORS_TITLES_DIR="job_errors_titles" + +# Create cache directories if they don't exist +mkdir -p "$JOB_SUMMARY_URL_DIR" +mkdir -p "$JOB_SUMMARY_MD_DIR" +mkdir -p "$JOB_ERRORS_DIR" +mkdir -p "$JOB_ERRORS_TITLES_DIR" + +# Get run ids for the workflow +echo "Fetching up to $RUNS_LIMIT workflow runs for '$WORKFLOW'..." +RUNS=$(gh run list --workflow "$WORKFLOW" --status success --json databaseId,url --limit "$RUNS_LIMIT") + +# Count total runs +TOTAL_RUNS=$(echo "$RUNS" | jq '. | length') +echo "Found $TOTAL_RUNS workflow run(s)" +echo "" + +# Iterate over each run and make a curl request to /attempts/1 +echo "$RUNS" | jq -r '.[] | .url' | while read -r run_url; do + run_id=$(basename "$run_url") + + # Define cache file paths + url_cache_file="${JOB_SUMMARY_URL_DIR}/${run_id}.txt" + md_cache_file="${JOB_SUMMARY_MD_DIR}/${run_id}.md" + errors_cache_file="${JOB_ERRORS_DIR}/${run_id}.md" + errors_titles_cache_file="${JOB_ERRORS_TITLES_DIR}/${run_id}.txt" + + echo "Processing run: ${run_id}" + + # Check if job_summary_path is cached + if [ -f "$url_cache_file" ]; then + echo " Using cached job_summary_path" + job_summary_path=$(cat "$url_cache_file") + else + echo " Fetching run page: ${run_url}" + html_content=$(curl -s -L -H "Cookie: user_session=$GITHUB_USER_SESSION" "$run_url") + + # Extract job summary path + job_summary_path=$(echo "$html_content" | grep -oE -m 1 "/Expensify/App/actions/runs/${run_id}/jobs/[0-9]+/summary_raw" | head -1) + + if [ -z "$job_summary_path" ]; then + echo " No job summary found, skipping..." + continue + fi + + # Cache the job_summary_path + echo "$job_summary_path" > "$url_cache_file" + echo " Cached job_summary_path" + fi + + job_summary_url="https://github.com${job_summary_path}" + + # Check if job_summary_content is cached + if [ -f "$md_cache_file" ]; then + echo " Using cached job_summary_content" + job_summary_content=$(cat "$md_cache_file") + else + echo " Fetching job summary: ${job_summary_url}" + job_summary_content=$(curl -s -L -H "Cookie: user_session=$GITHUB_USER_SESSION" "$job_summary_url") + + # Cache the job_summary_content + echo "$job_summary_content" > "$md_cache_file" + echo " Cached job_summary_content" + fi + + # Extract error blocks (content between --- delimiters) and cache them + if [ ! 
-f "$errors_cache_file" ]; then + # Use awk to extract blocks containing errors between --- delimiters + error_blocks=$(echo "$job_summary_content" | awk ' + BEGIN { in_block=0; block="" } + /^---$/ { + if (in_block && block ~ /❌ \*\*Error:\*\*/) { + print block "---" + } + in_block=1 + block="---\n" + next + } + in_block { block = block $0 "\n" } + ') + + if [ -n "$error_blocks" ]; then + echo "$error_blocks" > "$errors_cache_file" + echo " Cached errors ($(echo "$error_blocks" | grep -c "^---$")/2 blocks)" + else + echo " No errors found" + touch "$errors_cache_file" + fi + else + echo " Errors already cached" + fi + + # Extract error titles from cached errors and cache them + if [ ! -f "$errors_titles_cache_file" ]; then + if [ -s "$errors_cache_file" ]; then + error_titles=$(grep "^❌ \*\*Error:\*\*" "$errors_cache_file") + if [ -n "$error_titles" ]; then + echo "$error_titles" > "$errors_titles_cache_file" + echo " Cached error titles ($(echo "$error_titles" | wc -l | xargs) titles)" + else + touch "$errors_titles_cache_file" + fi + else + touch "$errors_titles_cache_file" + fi + else + echo " Error titles already cached" + fi +done \ No newline at end of file diff --git a/job_error_aggregator.sh b/job_error_aggregator.sh new file mode 100755 index 000000000000..eb544be5f365 --- /dev/null +++ b/job_error_aggregator.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# Script to aggregate error titles from job_errors_titles directory +# Usage: ./job_error_aggregator.sh [directory] [output_file] + +# Default directory or use first argument +ERROR_TITLES_DIR="${1:-job_errors_titles}" +OUTPUT_FILE="${2:-job_error_report.md}" + +# Check if directory exists +if [ ! -d "$ERROR_TITLES_DIR" ]; then + echo "Error: Directory '$ERROR_TITLES_DIR' does not exist" + exit 1 +fi + +# Check if directory has any files +file_count=$(find "$ERROR_TITLES_DIR" -type f | wc -l | xargs) +if [ "$file_count" -eq 0 ]; then + echo "Error: No files found in '$ERROR_TITLES_DIR'" + exit 1 +fi + +echo "Aggregating errors from $file_count files in '$ERROR_TITLES_DIR'..." 
+echo "Output will be saved to: $OUTPUT_FILE" +echo "" + +# Create a temporary file to store all error messages +temp_file=$(mktemp) + +# Read all files and extract error messages (removing the ❌ **Error:** prefix) +for file in "$ERROR_TITLES_DIR"/*.txt; do + if [ -f "$file" ] && [ -s "$file" ]; then + # Extract the error message part after "❌ **Error:** " + sed 's/^❌ \*\*Error:\*\* //' "$file" >> "$temp_file" + fi +done + +# Calculate summary statistics +total_errors=$(wc -l < "$temp_file" | xargs) +unique_errors=$(sort "$temp_file" | uniq | wc -l | xargs) + +# Generate report header +{ + echo "# Error Frequency Report" + echo "" + echo "**Generated:** $(date '+%Y-%m-%d %H:%M:%S')" + echo "**Source Directory:** \`$ERROR_TITLES_DIR\`" + echo "**Files Processed:** $file_count" + echo "" + echo "## Summary" + echo "" + echo "- **Total errors:** $total_errors" + echo "- **Unique errors:** $unique_errors" + echo "" + echo "## Error Breakdown" + echo "" + echo "| Count | Error Message |" + echo "|------:|---------------|" + + # Sort, count unique lines, sort by count (descending), and format as table + sort "$temp_file" | uniq -c | sort -rn | while read -r count error; do + printf "| %d | \`%s\` |\n" "$count" "$error" + done +} > "$OUTPUT_FILE" + +# Display the report on console +cat "$OUTPUT_FILE" + +echo "" +echo "✅ Report saved to: $OUTPUT_FILE" + +# Cleanup +rm "$temp_file" + From c2b0e6e3d47bc60bc9354cb1c5319eed29827a67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Miko=C5=82ajczak?= Date: Fri, 17 Oct 2025 14:26:56 +0200 Subject: [PATCH 2/3] add script readme --- AI_REVIEW_ERROR_REPORTER_README.md | 510 +++++++++++++++++++++++++++++ 1 file changed, 510 insertions(+) create mode 100644 AI_REVIEW_ERROR_REPORTER_README.md diff --git a/AI_REVIEW_ERROR_REPORTER_README.md b/AI_REVIEW_ERROR_REPORTER_README.md new file mode 100644 index 000000000000..8935cc84e128 --- /dev/null +++ b/AI_REVIEW_ERROR_REPORTER_README.md @@ -0,0 +1,510 @@ +# AI Review Error Reporter + +A collection of scripts to analyze and aggregate errors from GitHub Actions workflow runs for the `claude-review.yml` workflow. + +## Overview + +This toolset helps identify and track common errors in AI review workflows by: +1. Collecting workflow run data from GitHub +2. Extracting error messages from job summaries +3. Aggregating and ranking errors by frequency + +## Scripts + +### 1. `ai_review_error_reporter.sh` (Main Orchestrator) +Main entry point that runs the entire pipeline. + +**Usage:** +```bash +./ai_review_error_reporter.sh [runs_limit] [output_file] +``` + +**Parameters:** +- `runs_limit`: Number of workflow runs to fetch (default: 100) +- `output_file`: Output file for the error report (default: `job_error_report.md`) + +**Example:** +```bash +./ai_review_error_reporter.sh 50 my_report.md +``` + +### 2. `claude_review_collector.sh` (Data Collection) +Fetches workflow run data from GitHub and caches it locally. + +**Usage:** +```bash +./claude_review_collector.sh [limit] +``` + +**Parameters:** +- `limit`: Number of workflow runs to fetch (default: 100) + +### 3. `job_error_aggregator.sh` (Analysis) +Aggregates and ranks error messages from cached data. + +**Usage:** +```bash +./job_error_aggregator.sh [directory] [output_file] +``` + +**Parameters:** +- `directory`: Directory containing error title files (default: `job_errors_titles`) +- `output_file`: Output markdown report (default: `job_error_report.md`) + +## Setup + +### Prerequisites + +1. 
**GitHub CLI (`gh`)** - Must be installed and authenticated + ```bash + # Install (macOS) + brew install gh + + # Authenticate + gh auth login + ``` + +2. **jq** - JSON processor + ```bash + # Install (macOS) + brew install jq + ``` + +3. **curl** - Usually pre-installed on macOS/Linux + +### Environment Variables + +The scripts require the following environment variable: + +```bash +export GITHUB_USER_SESSION="your_github_session_token" +``` + +#### How to Get Your GitHub Session Token: + +1. Log in to GitHub in your browser +2. Open Developer Tools (F12 or Cmd+Opt+I) +3. Go to the "Application" or "Storage" tab +4. Find "Cookies" → "https://github.com" +5. Look for the `user_session` cookie +6. Copy its value + +**⚠️ Security Warning:** +- Never commit this token to version control +- Keep it in your local environment or a `.env` file (which should be gitignored) +- This token provides access to your GitHub account - treat it like a password + +#### Recommended: Use a `.env` file + +Create a `.env` file in the project root (make sure it's in `.gitignore`): + +```bash +# .env +GITHUB_USER_SESSION=your_session_token_here +``` + +Then source it before running the scripts: + +```bash +source .env +./ai_review_error_reporter.sh +``` + +## Output & Caching + +### Cache Directories + +The scripts create the following directories for caching (one file per workflow run): + +#### `job_summary_url/` +**Content:** Raw URLs to job summary pages +**Format:** Text files named `{run_id}.txt` +**Example:** `18589302302.txt` → `/Expensify/App/actions/runs/18589302302/jobs/41169259293/summary_raw` +**Purpose:** Avoids re-scraping HTML pages to find job summary URLs +**Use Case:** URL mapping for direct access to job summaries + +#### `job_summary_md/` +**Content:** Complete job summary markdown from GitHub +**Format:** Markdown files named `{run_id}.md` +**Example:** `18589302302.md` → Full AI review summary including errors, warnings, and statistics +**Purpose:** Preserves the complete context of each review +**Size:** Typically 5-50 KB per file +**Use Case:** Full historical record of all AI review outputs + +#### `job_errors/` +**Content:** Extracted error blocks from job summaries (between `---` delimiters) +**Format:** Markdown files named `{run_id}.md` +**Example:** Contains only the error sections with full context: +```markdown +--- +❌ **Error:** This command requires approval +**File:** src/libs/actions/Report.ts +**Details:** Command execution blocked by security policy +--- +``` +**Purpose:** Structured error data with surrounding context +**Use Case:** Contextual error analysis, pattern detection, debugging + +#### `job_errors_titles/` +**Content:** Just the error message titles (one per line) +**Format:** Text files named `{run_id}.txt` +**Example:** +``` +❌ **Error:** This command requires approval +❌ **Error:** could not determine current branch +``` +**Purpose:** Quick frequency analysis without full context +**Use Case:** Current aggregation reports, trend analysis + +### Cached Data for Advanced Analysis + +> **💡 Pro Tip:** The cached data in `job_errors/` and `job_summary_md/` contains rich contextual information beyond just error titles. 
This data can be leveraged for: +> +> - **Contextual Error Analysis:** Understanding which files, functions, or code patterns trigger specific errors +> - **Error Co-occurrence:** Identifying errors that frequently appear together in the same run +> - **Temporal Patterns:** Analyzing how errors evolve over time or correlate with code changes +> - **PR/Author Correlation:** Linking errors to specific pull requests or authors +> - **Error Classification:** Automatically categorizing errors by type (permissions, syntax, rate limits, etc.) +> - **Root Cause Analysis:** Tracing errors back to specific code changes using the full context +> - **Predictive Analysis:** Building models to predict potential errors before they occur +> +> The current aggregator focuses on frequency, but the cached data supports much deeper analysis. Consider building additional tools that parse `job_errors/` for contextual insights. + +### Final Report + +The final report (`job_error_report.md` by default) includes: +- **Generation timestamp** - When the report was created +- **Source information** - Which cache directory was analyzed +- **Files processed** - Number of workflow runs included +- **Summary statistics:** + - Total error count + - Unique error types +- **Frequency-ranked table** - Errors sorted by occurrence count +- **Error messages** - Full error text for each unique error + +Example report structure: +```markdown +# Error Frequency Report + +**Generated:** 2024-10-17 13:00:18 +**Source Directory:** `job_errors_titles` +**Files Processed:** 50 + +## Summary +- **Total errors:** 127 +- **Unique errors:** 15 + +## Error Breakdown +| Count | Error Message | +|------:|---------------| +| 45 | `This command requires approval` | +| 23 | `could not determine current branch` | +... +``` + +### Caching Behavior + +The scripts use aggressive caching to avoid redundant API calls: +- ✅ **Once a workflow run is processed, all its data is cached locally** +- ✅ **Re-running the scripts will use cached data for previously processed runs** +- ✅ **New runs are automatically fetched and added to the cache** +- ✅ **Cache persists across script invocations** (stored on disk) +- ⚠️ **To force a refresh of specific runs, delete their corresponding cache files** +- ⚠️ **To start fresh, delete entire cache directories** + +**Cache Efficiency:** +- First run (100 workflows): ~8-10 minutes +- Subsequent run (same 100): ~30 seconds (uses cache) +- Adding 10 new runs: ~1 minute (only fetches new data) + +## Usage Examples + +### Quick Analysis (Last 10 runs) +```bash +./ai_review_error_reporter.sh 10 +``` + +### Comprehensive Analysis (Last 200 runs) +```bash +./ai_review_error_reporter.sh 200 comprehensive_report.md +``` + +### Re-analyze Cached Data +```bash +# Just run the aggregator on already-cached data +./job_error_aggregator.sh job_errors_titles updated_report.md +``` + +## Known Issues & Limitations + +This section documents the current limitations and burning issues with the AI Review Error Reporter scripts. + +### 🔥 Critical Issues + +#### 1. **Authentication Method: Cookie-Based (Fragile)** +**Severity:** 🔴 High +**Impact:** Script fails when cookie expires (typically every 30 days) + +**Problem:** +- Uses `GITHUB_USER_SESSION` cookie which requires manual extraction from browser +- Cookie expires periodically, requiring manual renewal +- Not suitable for automated/CI environments +- Tied to a personal GitHub account + +**Workaround:** +```bash +# Manual process required: +# 1. Open GitHub in browser +# 2. 
Open DevTools > Application > Cookies
# 3. Find user_session cookie
# 4. Copy and export:
export GITHUB_USER_SESSION="new_cookie_value"
```

---

#### 2. **HTML Scraping Instead of Official API**
**Severity:** 🔴 High
**Impact:** Breaks if GitHub changes their HTML structure

**Problem:**
- Scrapes HTML pages to find job summary URLs:
  ```bash
  grep -oE -m 1 "/Expensify/App/actions/runs/${run_id}/jobs/[0-9]+/summary_raw"
  ```
- Fragile: Any change to GitHub's HTML breaks the script
- Requires authentication cookies (can't use API tokens)
- Slower than direct API access
- Not officially supported by GitHub

**Why:** Job summaries are not available through the official GitHub Actions API.

**Better Solution Available:**
Have the reviewer upload its logs as a GitHub Actions artifact in addition to writing the job summary. Artifacts can then be retrieved through the official GitHub CLI.

---

#### 3. **No Rate Limiting Handling**
**Severity:** 🟡 Medium
**Impact:** Script fails silently when hitting GitHub API rate limits

**Problem:**
- No detection of rate limit status
- No retry logic when rate limited
- No waiting/backoff mechanism
- Silent failures that are hard to debug

**Manifestation:**
```bash
# Runs fine for 50-60 requests, then:
Processing run: 12345678
  No job summary found, skipping...  # Actually rate limited!
```

**Workaround:** Wait 1 hour and re-run (rate limits reset hourly)

**Better Solution Available:** `claude_review_collector_ci.sh` includes rate limit monitoring and automatic waiting. See `CI_IMPROVEMENTS.md`.

---

### ⚠️ Medium Priority Issues

#### 4. **Hardcoded Repository Path**
**Severity:** 🟡 Medium
**Impact:** Can't easily use with other repositories

**Problem:**
```bash
# Hardcoded in the regex:
/Expensify/App/actions/runs/${run_id}/jobs/[0-9]+/summary_raw
```

**Workaround:** Edit the script to change repository path

---

#### 5. **Sequential Processing (Slow for Large Batches)**
**Severity:** 🟡 Medium
**Impact:** Takes ~10-15 minutes to process 100 workflow runs

**Problem:**
- Processes one run at a time in a while loop
- Network latency multiplied by number of runs
- Could be parallelized for 5-10x speedup

**Current Performance:**
- 10 runs: ~1 minute
- 50 runs: ~5 minutes
- 100 runs: ~10 minutes
- 500 runs: ~50 minutes

**Potential Solution:** Implement parallel processing with `xargs -P` or GNU `parallel` (see the sketch below issue 6). See `CI_IMPROVEMENTS.md` for implementation examples.

---

#### 6. **No CI/CD Integration**
**Severity:** 🟡 Medium
**Impact:** Can't run automatically in GitHub Actions or other CI systems

**Problem:**
- Requires manual cookie setup (not available in CI)
- No GitHub Actions workflow provided
- Can't leverage Actions cache for faster runs
- No automatic scheduling

---

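As a concrete illustration of the parallelization suggested in issue #5, here is a minimal sketch using `xargs -P`. It assumes the per-run body of `claude_review_collector.sh` has been factored into a `process_run` function; the function name and the concurrency level are illustrative, not part of the current scripts:

```bash
# Hypothetical parallel variant of the collector loop (sketch only).
# Assumes process_run() wraps the existing per-run fetch/cache logic.
process_run() {
    local run_url="$1"
    local run_id
    run_id=$(basename "$run_url")
    # ... existing fetch/cache logic for a single run goes here ...
    echo "Processed run: ${run_id}"
}
export -f process_run       # make the function visible to the child shells
export GITHUB_USER_SESSION  # child shells need the session cookie too

# Fan out up to 8 runs at a time instead of processing them sequentially.
echo "$RUNS" | jq -r '.[] | .url' | xargs -P 8 -I {} bash -c 'process_run "$@"' _ {}
```

Because each run writes only to its own cache files, parallel workers should not contend on output; only the console logging interleaves.

---

#### 7. 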
**Limited Error Context in Reports** +**Severity:** 🟡 Medium +**Impact:** Hard to understand root causes from aggregated reports + +**Problem:** +- Current aggregator only shows error titles +- Loses valuable context from `job_errors/` directory: + - Which files triggered errors + - Which PRs/branches were involved + - Full error details and stack traces + - Temporal patterns + +**Example - What's Available:** +```markdown +| Count | Error Message | +|------:|---------------| +| 45 | `This command requires approval` | +``` + +**Example - What Could Be Available:** +```markdown +| Count | Error | Common Files | Common PRs | Trend | +|------:|-------|--------------|------------|-------| +| 45 | Command approval | src/libs/actions/*.ts | #12345, #12389 | ↑ +15% | +``` + +**Potential Solution:** Build enhanced analyzer that parses `job_errors/` for contextual data. See "Cached Data for Advanced Analysis" section above for ideas. + +--- + +### 📝 Minor Issues + +#### 8. **No Progress Indicators for Long Operations** +**Severity:** 🟢 Low +**Impact:** Appears stuck during long runs + +**Problem:** +```bash +Processing run: 18589302302 + Fetching job summary: https://github.com/... + # Appears frozen here for 5-10 seconds +``` + +**Workaround:** Be patient, check network activity + +**Potential Solution:** Add progress bars or timestamps to output + +--- + +#### 9. **Cache Directories Not in .gitignore** +**Severity:** 🟢 Low +**Impact:** Risk of committing large cache files + +**Problem:** +- Cache directories (`job_summary_url/`, `job_summary_md/`, etc.) not automatically ignored +- Could accidentally commit 100s of cached files +- Increases repository size + +**Solution:** Add to `.gitignore`: +```gitignore +# AI Review Error Reporter cache +/job_summary_url/ +/job_summary_md/ +/job_errors/ +/job_errors_titles/ +/job_error_report.md +``` + +--- + +#### 10. **No Automatic Cleanup of Old Cache** +**Severity:** 🟢 Low +**Impact:** Cache grows unbounded over time + +**Problem:** +- Old workflow run data cached forever +- Cache can grow to 100s of MB over months +- No automatic cleanup mechanism + +**Workaround:** Manually delete old cache files periodically +```bash +# Delete cache files older than 30 days +find job_* -type f -mtime +30 -delete +``` + +--- + +### 🎯 Recommended Actions + +**Immediate (Do Now):** +1. ✅ Review `CI_IMPROVEMENTS.md` for solutions to Critical issues #1, #2, #3 +2. ✅ Consider switching to `claude_review_collector_ci.sh` (solves 6 out of 10 issues) +3. ⚠️ Add cache directories to `.gitignore` + +**Short-term (This Week):** +1. Test `claude_review_collector_ci.sh` locally +2. Deploy GitHub Actions workflow for automated analysis +3. Add cache cleanup cronjob/script + +**Long-term (This Month):** +1. Build enhanced analyzer for contextual error analysis +2. Implement parallel processing for faster execution +3. Create dashboard/visualization for error trends + +--- + +### 📚 Related Documentation + +For solutions to these issues: +- **CI-Ready Version:** See `claude_review_collector_ci.sh` and `CI_IMPROVEMENTS.md` +- **GitHub Actions Setup:** See `.github/workflows/ai-review-error-analysis.yml.example` +- **Migration Guide:** See `QUICK_START_CI.md` (deleted but can be recreated) +- **Before/After Comparison:** See `BEFORE_AFTER_COMPARISON.md` +- **Complete Analysis:** See `CI_READY_SUMMARY.md` + +--- + +## Troubleshooting + +### Error: "GITHUB_USER_SESSION environment variable is not set" +You need to set the `GITHUB_USER_SESSION` environment variable. 
See the Setup section above. + +### Error: "gh: command not found" +Install GitHub CLI: `brew install gh` (macOS) or follow [official instructions](https://cli.github.com/manual/installation) + +### Error: "jq: command not found" +Install jq: `brew install jq` (macOS) or `apt-get install jq` (Linux) + +### Error: "gh run list" fails +Make sure GitHub CLI is authenticated: `gh auth login` + +### "No job summary found" for many runs +This could be: +1. Rate limiting (wait 1 hour) +2. Cookie expired (get new cookie) +3. Workflow runs don't have job summaries (normal for some runs) + +### Script is very slow +- First run is always slow (fetching data) +- Subsequent runs use cache (much faster) +- Consider using CI-ready version for better performance + +--- + +## Security Best Practices + +1. ✅ **DO** store credentials in environment variables +2. ✅ **DO** use a `.env` file (and gitignore it) +3. ✅ **DO** rotate your session token periodically +4. ❌ **DON'T** commit credentials to version control +5. ❌ **DON'T** share your session token +6. ❌ **DON'T** hardcode credentials in scripts + +**Note:** The CI-ready version (`claude_review_collector_ci.sh`) uses GitHub tokens instead of cookies, which is more secure and doesn't expire. See `CI_IMPROVEMENTS.md` for details. From 9a43d3a82b9366fe771a175ec6c70faab18e51a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Miko=C5=82ajczak?= Date: Fri, 17 Oct 2025 14:34:14 +0200 Subject: [PATCH 3/3] update readme --- AI_REVIEW_ERROR_REPORTER_README.md | 81 +++--------------------------- 1 file changed, 7 insertions(+), 74 deletions(-) diff --git a/AI_REVIEW_ERROR_REPORTER_README.md b/AI_REVIEW_ERROR_REPORTER_README.md index 8935cc84e128..6391736dee79 100644 --- a/AI_REVIEW_ERROR_REPORTER_README.md +++ b/AI_REVIEW_ERROR_REPORTER_README.md @@ -27,6 +27,8 @@ Main entry point that runs the entire pipeline. ```bash ./ai_review_error_reporter.sh 50 my_report.md ``` +
+
+Internal script details (these are invoked automatically by the orchestrator):

### 2. `claude_review_collector.sh` (Data Collection)
Fetches workflow run data from GitHub and caches it locally.

**Usage:**
```bash
./claude_review_collector.sh [limit]
```

**Parameters:**
- `limit`: Number of workflow runs to fetch (default: 100)

### 3. `job_error_aggregator.sh` (Analysis)
Aggregates and ranks error messages from cached data.

**Usage:**
```bash
./job_error_aggregator.sh [directory] [output_file]
```

**Parameters:**
- `directory`: Directory containing error title files (default: `job_errors_titles`)
- `output_file`: Output markdown report (default: `job_error_report.md`)
+
+ + ## Setup ### Prerequisites @@ -301,10 +306,6 @@ Processing run: 12345678 No job summary found, skipping... # Actually rate limited! ``` -**Workaround:** Wait 1 hour and re-run (rate limits reset hourly) - -**Better Solution Available:** `claude_review_collector_ci.sh` includes rate limit monitoring and automatic waiting. See `CI_IMPROVEMENTS.md`. - --- ### ⚠️ Medium Priority Issues @@ -338,7 +339,7 @@ Processing run: 12345678 - 100 runs: ~10 minutes - 500 runs: ~50 minutes -**Potential Solution:** Implement parallel processing with `xargs -P` or GNU `parallel`. See `CI_IMPROVEMENTS.md` for implementation examples. +**Potential Solution:** Implement parallel processing with `xargs -P` or GNU `parallel`. --- @@ -439,72 +440,4 @@ Processing run: 18589302302 find job_* -type f -mtime +30 -delete ``` ---- - -### 🎯 Recommended Actions - -**Immediate (Do Now):** -1. ✅ Review `CI_IMPROVEMENTS.md` for solutions to Critical issues #1, #2, #3 -2. ✅ Consider switching to `claude_review_collector_ci.sh` (solves 6 out of 10 issues) -3. ⚠️ Add cache directories to `.gitignore` - -**Short-term (This Week):** -1. Test `claude_review_collector_ci.sh` locally -2. Deploy GitHub Actions workflow for automated analysis -3. Add cache cleanup cronjob/script - -**Long-term (This Month):** -1. Build enhanced analyzer for contextual error analysis -2. Implement parallel processing for faster execution -3. Create dashboard/visualization for error trends - ---- - -### 📚 Related Documentation - -For solutions to these issues: -- **CI-Ready Version:** See `claude_review_collector_ci.sh` and `CI_IMPROVEMENTS.md` -- **GitHub Actions Setup:** See `.github/workflows/ai-review-error-analysis.yml.example` -- **Migration Guide:** See `QUICK_START_CI.md` (deleted but can be recreated) -- **Before/After Comparison:** See `BEFORE_AFTER_COMPARISON.md` -- **Complete Analysis:** See `CI_READY_SUMMARY.md` - ---- - -## Troubleshooting - -### Error: "GITHUB_USER_SESSION environment variable is not set" -You need to set the `GITHUB_USER_SESSION` environment variable. See the Setup section above. - -### Error: "gh: command not found" -Install GitHub CLI: `brew install gh` (macOS) or follow [official instructions](https://cli.github.com/manual/installation) - -### Error: "jq: command not found" -Install jq: `brew install jq` (macOS) or `apt-get install jq` (Linux) - -### Error: "gh run list" fails -Make sure GitHub CLI is authenticated: `gh auth login` - -### "No job summary found" for many runs -This could be: -1. Rate limiting (wait 1 hour) -2. Cookie expired (get new cookie) -3. Workflow runs don't have job summaries (normal for some runs) - -### Script is very slow -- First run is always slow (fetching data) -- Subsequent runs use cache (much faster) -- Consider using CI-ready version for better performance - ---- - -## Security Best Practices - -1. ✅ **DO** store credentials in environment variables -2. ✅ **DO** use a `.env` file (and gitignore it) -3. ✅ **DO** rotate your session token periodically -4. ❌ **DON'T** commit credentials to version control -5. ❌ **DON'T** share your session token -6. ❌ **DON'T** hardcode credentials in scripts - -**Note:** The CI-ready version (`claude_review_collector_ci.sh`) uses GitHub tokens instead of cookies, which is more secure and doesn't expire. See `CI_IMPROVEMENTS.md` for details. +--- \ No newline at end of file