From 93c27b5a37eadcbd7fa5709f152b98384d4614f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Miko=C5=82ajczak?= Date: Fri, 17 Oct 2025 13:52:53 +0200 Subject: [PATCH 1/3] v1 --- ai_review_error_reporter.sh | 71 +++++++++++++++++++ claude_review_collector.sh | 136 ++++++++++++++++++++++++++++++++++++ job_error_aggregator.sh | 74 ++++++++++++++++++++ 3 files changed, 281 insertions(+) create mode 100755 ai_review_error_reporter.sh create mode 100755 claude_review_collector.sh create mode 100755 job_error_aggregator.sh diff --git a/ai_review_error_reporter.sh b/ai_review_error_reporter.sh new file mode 100755 index 000000000000..2d56fa964817 --- /dev/null +++ b/ai_review_error_reporter.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# Orchestrator script to collect and analyze AI review errors +# Usage: ./ai_review_error_reporter.sh [runs_limit] [output_file] +# runs_limit: Number of workflow runs to fetch (default: 100) +# output_file: Output file for the error report (default: job_error_report.md) +# +# Required environment variables: +# GITHUB_USER_SESSION: GitHub user session cookie for authenticated requests +# +# Required tools: +# - gh (GitHub CLI): Must be authenticated +# - jq: JSON processor +# - curl: HTTP client + +set -e # Exit on any error + +# Get script directory (where this script is located) +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +# Parameters +RUNS_LIMIT="${1:-100}" +OUTPUT_FILE="${2:-job_error_report.md}" + +echo "========================================" +echo "AI Review Error Reporter" +echo "========================================" +echo "" +echo "Configuration:" +echo " Workflow runs limit: $RUNS_LIMIT" +echo " Output report file: $OUTPUT_FILE" +echo "" + +# Step 1: Collect workflow run data +echo "========================================" +echo "Step 1: Collecting workflow run data" +echo "========================================" +echo "" + +if [ ! -x "$SCRIPT_DIR/claude_review_collector.sh" ]; then + echo "Error: claude_review_collector.sh not found or not executable" + exit 1 +fi + +"$SCRIPT_DIR/claude_review_collector.sh" "$RUNS_LIMIT" + +echo "" +echo "✅ Data collection completed" +echo "" + +# Step 2: Aggregate and analyze errors +echo "========================================" +echo "Step 2: Aggregating and analyzing errors" +echo "========================================" +echo "" + +if [ ! -x "$SCRIPT_DIR/job_error_aggregator.sh" ]; then + echo "Error: job_error_aggregator.sh not found or not executable" + exit 1 +fi + +"$SCRIPT_DIR/job_error_aggregator.sh" "job_errors_titles" "$OUTPUT_FILE" + +echo "" +echo "========================================" +echo "✅ Report generation completed!" 
+echo "========================================" +echo "" +echo "📊 Report saved to: $OUTPUT_FILE" +echo "" + diff --git a/claude_review_collector.sh b/claude_review_collector.sh new file mode 100755 index 000000000000..f0ea19e00f65 --- /dev/null +++ b/claude_review_collector.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# Script to collect and cache claude-review workflow run data +# Usage: ./claude_review_collector.sh [limit] +# limit: Number of workflow runs to fetch (default: 100) +# +# Required environment variables: +# GITHUB_USER_SESSION: GitHub user session cookie for authenticated requests +# +# Note: GitHub CLI (gh) must be authenticated for API requests + +WORKFLOW="claude-review.yml" + +# Validate required environment variables +if [ -z "$GITHUB_USER_SESSION" ]; then + echo "Error: GITHUB_USER_SESSION environment variable is not set" + echo "Please set it with: export GITHUB_USER_SESSION='your_session_token'" + exit 1 +fi + +# Workflow runs limit (default: 100, can be overridden by first argument) +RUNS_LIMIT="${1:-100}" + +# Cache directories +JOB_SUMMARY_URL_DIR="job_summary_url" +JOB_SUMMARY_MD_DIR="job_summary_md" +JOB_ERRORS_DIR="job_errors" +JOB_ERRORS_TITLES_DIR="job_errors_titles" + +# Create cache directories if they don't exist +mkdir -p "$JOB_SUMMARY_URL_DIR" +mkdir -p "$JOB_SUMMARY_MD_DIR" +mkdir -p "$JOB_ERRORS_DIR" +mkdir -p "$JOB_ERRORS_TITLES_DIR" + +# Get run ids for the workflow +echo "Fetching up to $RUNS_LIMIT workflow runs for '$WORKFLOW'..." +RUNS=$(gh run list --workflow "$WORKFLOW" --status success --json databaseId,url --limit "$RUNS_LIMIT") + +# Count total runs +TOTAL_RUNS=$(echo "$RUNS" | jq '. | length') +echo "Found $TOTAL_RUNS workflow run(s)" +echo "" + +# Iterate over each run and make a curl request to /attempts/1 +echo "$RUNS" | jq -r '.[] | .url' | while read -r run_url; do + run_id=$(basename "$run_url") + + # Define cache file paths + url_cache_file="${JOB_SUMMARY_URL_DIR}/${run_id}.txt" + md_cache_file="${JOB_SUMMARY_MD_DIR}/${run_id}.md" + errors_cache_file="${JOB_ERRORS_DIR}/${run_id}.md" + errors_titles_cache_file="${JOB_ERRORS_TITLES_DIR}/${run_id}.txt" + + echo "Processing run: ${run_id}" + + # Check if job_summary_path is cached + if [ -f "$url_cache_file" ]; then + echo " Using cached job_summary_path" + job_summary_path=$(cat "$url_cache_file") + else + echo " Fetching run page: ${run_url}" + html_content=$(curl -s -L -H "Cookie: user_session=$GITHUB_USER_SESSION" "$run_url") + + # Extract job summary path + job_summary_path=$(echo "$html_content" | grep -oE -m 1 "/Expensify/App/actions/runs/${run_id}/jobs/[0-9]+/summary_raw" | head -1) + + if [ -z "$job_summary_path" ]; then + echo " No job summary found, skipping..." + continue + fi + + # Cache the job_summary_path + echo "$job_summary_path" > "$url_cache_file" + echo " Cached job_summary_path" + fi + + job_summary_url="https://github.com${job_summary_path}" + + # Check if job_summary_content is cached + if [ -f "$md_cache_file" ]; then + echo " Using cached job_summary_content" + job_summary_content=$(cat "$md_cache_file") + else + echo " Fetching job summary: ${job_summary_url}" + job_summary_content=$(curl -s -L -H "Cookie: user_session=$GITHUB_USER_SESSION" "$job_summary_url") + + # Cache the job_summary_content + echo "$job_summary_content" > "$md_cache_file" + echo " Cached job_summary_content" + fi + + # Extract error blocks (content between --- delimiters) and cache them + if [ ! 
-f "$errors_cache_file" ]; then + # Use awk to extract blocks containing errors between --- delimiters + error_blocks=$(echo "$job_summary_content" | awk ' + BEGIN { in_block=0; block="" } + /^---$/ { + if (in_block && block ~ /❌ \*\*Error:\*\*/) { + print block "---" + } + in_block=1 + block="---\n" + next + } + in_block { block = block $0 "\n" } + ') + + if [ -n "$error_blocks" ]; then + echo "$error_blocks" > "$errors_cache_file" + echo " Cached errors ($(echo "$error_blocks" | grep -c "^---$")/2 blocks)" + else + echo " No errors found" + touch "$errors_cache_file" + fi + else + echo " Errors already cached" + fi + + # Extract error titles from cached errors and cache them + if [ ! -f "$errors_titles_cache_file" ]; then + if [ -s "$errors_cache_file" ]; then + error_titles=$(grep "^❌ \*\*Error:\*\*" "$errors_cache_file") + if [ -n "$error_titles" ]; then + echo "$error_titles" > "$errors_titles_cache_file" + echo " Cached error titles ($(echo "$error_titles" | wc -l | xargs) titles)" + else + touch "$errors_titles_cache_file" + fi + else + touch "$errors_titles_cache_file" + fi + else + echo " Error titles already cached" + fi +done \ No newline at end of file diff --git a/job_error_aggregator.sh b/job_error_aggregator.sh new file mode 100755 index 000000000000..eb544be5f365 --- /dev/null +++ b/job_error_aggregator.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# Script to aggregate error titles from job_errors_titles directory +# Usage: ./job_error_aggregator.sh [directory] [output_file] + +# Default directory or use first argument +ERROR_TITLES_DIR="${1:-job_errors_titles}" +OUTPUT_FILE="${2:-job_error_report.md}" + +# Check if directory exists +if [ ! -d "$ERROR_TITLES_DIR" ]; then + echo "Error: Directory '$ERROR_TITLES_DIR' does not exist" + exit 1 +fi + +# Check if directory has any files +file_count=$(find "$ERROR_TITLES_DIR" -type f | wc -l | xargs) +if [ "$file_count" -eq 0 ]; then + echo "Error: No files found in '$ERROR_TITLES_DIR'" + exit 1 +fi + +echo "Aggregating errors from $file_count files in '$ERROR_TITLES_DIR'..." 
+echo "Output will be saved to: $OUTPUT_FILE" +echo "" + +# Create a temporary file to store all error messages +temp_file=$(mktemp) + +# Read all files and extract error messages (removing the ❌ **Error:** prefix) +for file in "$ERROR_TITLES_DIR"/*.txt; do + if [ -f "$file" ] && [ -s "$file" ]; then + # Extract the error message part after "❌ **Error:** " + sed 's/^❌ \*\*Error:\*\* //' "$file" >> "$temp_file" + fi +done + +# Calculate summary statistics +total_errors=$(wc -l < "$temp_file" | xargs) +unique_errors=$(sort "$temp_file" | uniq | wc -l | xargs) + +# Generate report header +{ + echo "# Error Frequency Report" + echo "" + echo "**Generated:** $(date '+%Y-%m-%d %H:%M:%S')" + echo "**Source Directory:** \`$ERROR_TITLES_DIR\`" + echo "**Files Processed:** $file_count" + echo "" + echo "## Summary" + echo "" + echo "- **Total errors:** $total_errors" + echo "- **Unique errors:** $unique_errors" + echo "" + echo "## Error Breakdown" + echo "" + echo "| Count | Error Message |" + echo "|------:|---------------|" + + # Sort, count unique lines, sort by count (descending), and format as table + sort "$temp_file" | uniq -c | sort -rn | while read -r count error; do + printf "| %d | \`%s\` |\n" "$count" "$error" + done +} > "$OUTPUT_FILE" + +# Display the report on console +cat "$OUTPUT_FILE" + +echo "" +echo "✅ Report saved to: $OUTPUT_FILE" + +# Cleanup +rm "$temp_file" + From c2b0e6e3d47bc60bc9354cb1c5319eed29827a67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Miko=C5=82ajczak?= Date: Fri, 17 Oct 2025 14:26:56 +0200 Subject: [PATCH 2/3] add script readme --- AI_REVIEW_ERROR_REPORTER_README.md | 510 +++++++++++++++++++++++++++++ 1 file changed, 510 insertions(+) create mode 100644 AI_REVIEW_ERROR_REPORTER_README.md diff --git a/AI_REVIEW_ERROR_REPORTER_README.md b/AI_REVIEW_ERROR_REPORTER_README.md new file mode 100644 index 000000000000..8935cc84e128 --- /dev/null +++ b/AI_REVIEW_ERROR_REPORTER_README.md @@ -0,0 +1,510 @@ +# AI Review Error Reporter + +A collection of scripts to analyze and aggregate errors from GitHub Actions workflow runs for the `claude-review.yml` workflow. + +## Overview + +This toolset helps identify and track common errors in AI review workflows by: +1. Collecting workflow run data from GitHub +2. Extracting error messages from job summaries +3. Aggregating and ranking errors by frequency + +## Scripts + +### 1. `ai_review_error_reporter.sh` (Main Orchestrator) +Main entry point that runs the entire pipeline. + +**Usage:** +```bash +./ai_review_error_reporter.sh [runs_limit] [output_file] +``` + +**Parameters:** +- `runs_limit`: Number of workflow runs to fetch (default: 100) +- `output_file`: Output file for the error report (default: `job_error_report.md`) + +**Example:** +```bash +./ai_review_error_reporter.sh 50 my_report.md +``` + +### 2. `claude_review_collector.sh` (Data Collection) +Fetches workflow run data from GitHub and caches it locally. + +**Usage:** +```bash +./claude_review_collector.sh [limit] +``` + +**Parameters:** +- `limit`: Number of workflow runs to fetch (default: 100) + +### 3. `job_error_aggregator.sh` (Analysis) +Aggregates and ranks error messages from cached data. + +**Usage:** +```bash +./job_error_aggregator.sh [directory] [output_file] +``` + +**Parameters:** +- `directory`: Directory containing error title files (default: `job_errors_titles`) +- `output_file`: Output markdown report (default: `job_error_report.md`) + +## Setup + +### Prerequisites + +1. 
**GitHub CLI (`gh`)** - Must be installed and authenticated + ```bash + # Install (macOS) + brew install gh + + # Authenticate + gh auth login + ``` + +2. **jq** - JSON processor + ```bash + # Install (macOS) + brew install jq + ``` + +3. **curl** - Usually pre-installed on macOS/Linux + +### Environment Variables + +The scripts require the following environment variable: + +```bash +export GITHUB_USER_SESSION="your_github_session_token" +``` + +#### How to Get Your GitHub Session Token: + +1. Log in to GitHub in your browser +2. Open Developer Tools (F12 or Cmd+Opt+I) +3. Go to the "Application" or "Storage" tab +4. Find "Cookies" → "https://github.com" +5. Look for the `user_session` cookie +6. Copy its value + +**⚠️ Security Warning:** +- Never commit this token to version control +- Keep it in your local environment or a `.env` file (which should be gitignored) +- This token provides access to your GitHub account - treat it like a password + +#### Recommended: Use a `.env` file + +Create a `.env` file in the project root (make sure it's in `.gitignore`): + +```bash +# .env +GITHUB_USER_SESSION=your_session_token_here +``` + +Then source it before running the scripts: + +```bash +source .env +./ai_review_error_reporter.sh +``` + +## Output & Caching + +### Cache Directories + +The scripts create the following directories for caching (one file per workflow run): + +#### `job_summary_url/` +**Content:** Raw URLs to job summary pages +**Format:** Text files named `{run_id}.txt` +**Example:** `18589302302.txt` → `/Expensify/App/actions/runs/18589302302/jobs/41169259293/summary_raw` +**Purpose:** Avoids re-scraping HTML pages to find job summary URLs +**Use Case:** URL mapping for direct access to job summaries + +#### `job_summary_md/` +**Content:** Complete job summary markdown from GitHub +**Format:** Markdown files named `{run_id}.md` +**Example:** `18589302302.md` → Full AI review summary including errors, warnings, and statistics +**Purpose:** Preserves the complete context of each review +**Size:** Typically 5-50 KB per file +**Use Case:** Full historical record of all AI review outputs + +#### `job_errors/` +**Content:** Extracted error blocks from job summaries (between `---` delimiters) +**Format:** Markdown files named `{run_id}.md` +**Example:** Contains only the error sections with full context: +```markdown +--- +❌ **Error:** This command requires approval +**File:** src/libs/actions/Report.ts +**Details:** Command execution blocked by security policy +--- +``` +**Purpose:** Structured error data with surrounding context +**Use Case:** Contextual error analysis, pattern detection, debugging + +#### `job_errors_titles/` +**Content:** Just the error message titles (one per line) +**Format:** Text files named `{run_id}.txt` +**Example:** +``` +❌ **Error:** This command requires approval +❌ **Error:** could not determine current branch +``` +**Purpose:** Quick frequency analysis without full context +**Use Case:** Current aggregation reports, trend analysis + +### Cached Data for Advanced Analysis + +> **💡 Pro Tip:** The cached data in `job_errors/` and `job_summary_md/` contains rich contextual information beyond just error titles. 
This data can be leveraged for: +> +> - **Contextual Error Analysis:** Understanding which files, functions, or code patterns trigger specific errors +> - **Error Co-occurrence:** Identifying errors that frequently appear together in the same run +> - **Temporal Patterns:** Analyzing how errors evolve over time or correlate with code changes +> - **PR/Author Correlation:** Linking errors to specific pull requests or authors +> - **Error Classification:** Automatically categorizing errors by type (permissions, syntax, rate limits, etc.) +> - **Root Cause Analysis:** Tracing errors back to specific code changes using the full context +> - **Predictive Analysis:** Building models to predict potential errors before they occur +> +> The current aggregator focuses on frequency, but the cached data supports much deeper analysis. Consider building additional tools that parse `job_errors/` for contextual insights. + +### Final Report + +The final report (`job_error_report.md` by default) includes: +- **Generation timestamp** - When the report was created +- **Source information** - Which cache directory was analyzed +- **Files processed** - Number of workflow runs included +- **Summary statistics:** + - Total error count + - Unique error types +- **Frequency-ranked table** - Errors sorted by occurrence count +- **Error messages** - Full error text for each unique error + +Example report structure: +```markdown +# Error Frequency Report + +**Generated:** 2024-10-17 13:00:18 +**Source Directory:** `job_errors_titles` +**Files Processed:** 50 + +## Summary +- **Total errors:** 127 +- **Unique errors:** 15 + +## Error Breakdown +| Count | Error Message | +|------:|---------------| +| 45 | `This command requires approval` | +| 23 | `could not determine current branch` | +... +``` + +### Caching Behavior + +The scripts use aggressive caching to avoid redundant API calls: +- ✅ **Once a workflow run is processed, all its data is cached locally** +- ✅ **Re-running the scripts will use cached data for previously processed runs** +- ✅ **New runs are automatically fetched and added to the cache** +- ✅ **Cache persists across script invocations** (stored on disk) +- ⚠️ **To force a refresh of specific runs, delete their corresponding cache files** +- ⚠️ **To start fresh, delete entire cache directories** + +**Cache Efficiency:** +- First run (100 workflows): ~8-10 minutes +- Subsequent run (same 100): ~30 seconds (uses cache) +- Adding 10 new runs: ~1 minute (only fetches new data) + +## Usage Examples + +### Quick Analysis (Last 10 runs) +```bash +./ai_review_error_reporter.sh 10 +``` + +### Comprehensive Analysis (Last 200 runs) +```bash +./ai_review_error_reporter.sh 200 comprehensive_report.md +``` + +### Re-analyze Cached Data +```bash +# Just run the aggregator on already-cached data +./job_error_aggregator.sh job_errors_titles updated_report.md +``` + +## Known Issues & Limitations + +This section documents the current limitations and burning issues with the AI Review Error Reporter scripts. + +### 🔥 Critical Issues + +#### 1. **Authentication Method: Cookie-Based (Fragile)** +**Severity:** 🔴 High +**Impact:** Script fails when cookie expires (typically every 30 days) + +**Problem:** +- Uses `GITHUB_USER_SESSION` cookie which requires manual extraction from browser +- Cookie expires periodically, requiring manual renewal +- Not suitable for automated/CI environments +- Tied to a personal GitHub account + +**Workaround:** +```bash +# Manual process required: +# 1. Open GitHub in browser +# 2. 
Open DevTools > Application > Cookies
# 3. Find user_session cookie
# 4. Copy and export:
export GITHUB_USER_SESSION="new_cookie_value"
```

---

#### 2. **HTML Scraping Instead of Official API**
**Severity:** 🔴 High
**Impact:** Breaks if GitHub changes their HTML structure

**Problem:**
- Scrapes HTML pages to find job summary URLs:
  ```bash
  grep -oE -m 1 "/Expensify/App/actions/runs/${run_id}/jobs/[0-9]+/summary_raw"
  ```
- Fragile: Any change to GitHub's HTML breaks the script
- Requires authentication cookies (can't use API tokens)
- Slower than direct API access
- Not officially supported by GitHub

**Why:** Job summaries are not available through the official GitHub Actions API.

**Better Solution Available:**
Have the reviewer upload its logs as a GitHub Actions artifact in addition to writing the job summary. Artifacts can then be retrieved through the official GitHub CLI.

---

#### 3. **No Rate Limiting Handling**
**Severity:** 🟡 Medium
**Impact:** Script fails silently when hitting GitHub API rate limits

**Problem:**
- No detection of rate limit status
- No retry logic when rate limited
- No waiting/backoff mechanism
- Silent failures that are hard to debug

**Manifestation:**
```bash
# Runs fine for 50-60 requests, then:
Processing run: 12345678
  No job summary found, skipping...  # Actually rate limited!
```

**Workaround:** Wait 1 hour and re-run (rate limits reset hourly)

**Better Solution Available:** `claude_review_collector_ci.sh` includes rate limit monitoring and automatic waiting. See `CI_IMPROVEMENTS.md`.

---

### ⚠️ Medium Priority Issues

#### 4. **Hardcoded Repository Path**
**Severity:** 🟡 Medium
**Impact:** Can't easily use with other repositories

**Problem:**
```bash
# Hardcoded in the regex:
/Expensify/App/actions/runs/${run_id}/jobs/[0-9]+/summary_raw
```

**Workaround:** Edit the script to change repository path

---

#### 5. **Sequential Processing (Slow for Large Batches)**
**Severity:** 🟡 Medium
**Impact:** Takes ~10-15 minutes to process 100 workflow runs

**Problem:**
- Processes one run at a time in a while loop
- Network latency multiplied by number of runs
- Could be parallelized for 5-10x speedup

**Current Performance:**
- 10 runs: ~1 minute
- 50 runs: ~5 minutes
- 100 runs: ~10 minutes
- 500 runs: ~50 minutes

**Potential Solution:** Implement parallel processing with `xargs -P` or GNU `parallel` (see the sketch below issue 6). See `CI_IMPROVEMENTS.md` for implementation examples.

---

#### 6. **No CI/CD Integration**
**Severity:** 🟡 Medium
**Impact:** Can't run automatically in GitHub Actions or other CI systems

**Problem:**
- Requires manual cookie setup (not available in CI)
- No GitHub Actions workflow provided
- Can't leverage Actions cache for faster runs
- No automatic scheduling

---

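As a concrete illustration of the parallelization suggested in issue #5, here is a minimal sketch using `xargs -P`. It assumes the per-run body of `claude_review_collector.sh` has been factored into a `process_run` function; the function name and the concurrency level are illustrative, not part of the current scripts:

```bash
# Hypothetical parallel variant of the collector loop (sketch only).
# Assumes process_run() wraps the existing per-run fetch/cache logic.
process_run() {
    local run_url="$1"
    local run_id
    run_id=$(basename "$run_url")
    # ... existing fetch/cache logic for a single run goes here ...
    echo "Processed run: ${run_id}"
}
export -f process_run       # make the function visible to the child shells
export GITHUB_USER_SESSION  # child shells need the session cookie too

# Fan out up to 8 runs at a time instead of processing them sequentially.
echo "$RUNS" | jq -r '.[] | .url' | xargs -P 8 -I {} bash -c 'process_run "$@"' _ {}
```

Because each run writes only to its own cache files, parallel workers should not contend on output; only the console logging interleaves.

---

#### 7. 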
**Limited Error Context in Reports** +**Severity:** 🟡 Medium +**Impact:** Hard to understand root causes from aggregated reports + +**Problem:** +- Current aggregator only shows error titles +- Loses valuable context from `job_errors/` directory: + - Which files triggered errors + - Which PRs/branches were involved + - Full error details and stack traces + - Temporal patterns + +**Example - What's Available:** +```markdown +| Count | Error Message | +|------:|---------------| +| 45 | `This command requires approval` | +``` + +**Example - What Could Be Available:** +```markdown +| Count | Error | Common Files | Common PRs | Trend | +|------:|-------|--------------|------------|-------| +| 45 | Command approval | src/libs/actions/*.ts | #12345, #12389 | ↑ +15% | +``` + +**Potential Solution:** Build enhanced analyzer that parses `job_errors/` for contextual data. See "Cached Data for Advanced Analysis" section above for ideas. + +--- + +### 📝 Minor Issues + +#### 8. **No Progress Indicators for Long Operations** +**Severity:** 🟢 Low +**Impact:** Appears stuck during long runs + +**Problem:** +```bash +Processing run: 18589302302 + Fetching job summary: https://github.com/... + # Appears frozen here for 5-10 seconds +``` + +**Workaround:** Be patient, check network activity + +**Potential Solution:** Add progress bars or timestamps to output + +--- + +#### 9. **Cache Directories Not in .gitignore** +**Severity:** 🟢 Low +**Impact:** Risk of committing large cache files + +**Problem:** +- Cache directories (`job_summary_url/`, `job_summary_md/`, etc.) not automatically ignored +- Could accidentally commit 100s of cached files +- Increases repository size + +**Solution:** Add to `.gitignore`: +```gitignore +# AI Review Error Reporter cache +/job_summary_url/ +/job_summary_md/ +/job_errors/ +/job_errors_titles/ +/job_error_report.md +``` + +--- + +#### 10. **No Automatic Cleanup of Old Cache** +**Severity:** 🟢 Low +**Impact:** Cache grows unbounded over time + +**Problem:** +- Old workflow run data cached forever +- Cache can grow to 100s of MB over months +- No automatic cleanup mechanism + +**Workaround:** Manually delete old cache files periodically +```bash +# Delete cache files older than 30 days +find job_* -type f -mtime +30 -delete +``` + +--- + +### 🎯 Recommended Actions + +**Immediate (Do Now):** +1. ✅ Review `CI_IMPROVEMENTS.md` for solutions to Critical issues #1, #2, #3 +2. ✅ Consider switching to `claude_review_collector_ci.sh` (solves 6 out of 10 issues) +3. ⚠️ Add cache directories to `.gitignore` + +**Short-term (This Week):** +1. Test `claude_review_collector_ci.sh` locally +2. Deploy GitHub Actions workflow for automated analysis +3. Add cache cleanup cronjob/script + +**Long-term (This Month):** +1. Build enhanced analyzer for contextual error analysis +2. Implement parallel processing for faster execution +3. Create dashboard/visualization for error trends + +--- + +### 📚 Related Documentation + +For solutions to these issues: +- **CI-Ready Version:** See `claude_review_collector_ci.sh` and `CI_IMPROVEMENTS.md` +- **GitHub Actions Setup:** See `.github/workflows/ai-review-error-analysis.yml.example` +- **Migration Guide:** See `QUICK_START_CI.md` (deleted but can be recreated) +- **Before/After Comparison:** See `BEFORE_AFTER_COMPARISON.md` +- **Complete Analysis:** See `CI_READY_SUMMARY.md` + +--- + +## Troubleshooting + +### Error: "GITHUB_USER_SESSION environment variable is not set" +You need to set the `GITHUB_USER_SESSION` environment variable. 
See the Setup section above. + +### Error: "gh: command not found" +Install GitHub CLI: `brew install gh` (macOS) or follow [official instructions](https://cli.github.com/manual/installation) + +### Error: "jq: command not found" +Install jq: `brew install jq` (macOS) or `apt-get install jq` (Linux) + +### Error: "gh run list" fails +Make sure GitHub CLI is authenticated: `gh auth login` + +### "No job summary found" for many runs +This could be: +1. Rate limiting (wait 1 hour) +2. Cookie expired (get new cookie) +3. Workflow runs don't have job summaries (normal for some runs) + +### Script is very slow +- First run is always slow (fetching data) +- Subsequent runs use cache (much faster) +- Consider using CI-ready version for better performance + +--- + +## Security Best Practices + +1. ✅ **DO** store credentials in environment variables +2. ✅ **DO** use a `.env` file (and gitignore it) +3. ✅ **DO** rotate your session token periodically +4. ❌ **DON'T** commit credentials to version control +5. ❌ **DON'T** share your session token +6. ❌ **DON'T** hardcode credentials in scripts + +**Note:** The CI-ready version (`claude_review_collector_ci.sh`) uses GitHub tokens instead of cookies, which is more secure and doesn't expire. See `CI_IMPROVEMENTS.md` for details. From 9a43d3a82b9366fe771a175ec6c70faab18e51a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Miko=C5=82ajczak?= Date: Fri, 17 Oct 2025 14:34:14 +0200 Subject: [PATCH 3/3] update readme --- AI_REVIEW_ERROR_REPORTER_README.md | 81 +++--------------------------- 1 file changed, 7 insertions(+), 74 deletions(-) diff --git a/AI_REVIEW_ERROR_REPORTER_README.md b/AI_REVIEW_ERROR_REPORTER_README.md index 8935cc84e128..6391736dee79 100644 --- a/AI_REVIEW_ERROR_REPORTER_README.md +++ b/AI_REVIEW_ERROR_REPORTER_README.md @@ -27,6 +27,8 @@ Main entry point that runs the entire pipeline. ```bash ./ai_review_error_reporter.sh 50 my_report.md ``` +
+
+Internal script details (these are invoked automatically by the orchestrator):

### 2. `claude_review_collector.sh` (Data Collection)
Fetches workflow run data from GitHub and caches it locally.

**Usage:**
```bash
./claude_review_collector.sh [limit]
```

**Parameters:**
- `limit`: Number of workflow runs to fetch (default: 100)

### 3. `job_error_aggregator.sh` (Analysis)
Aggregates and ranks error messages from cached data.

**Usage:**
```bash
./job_error_aggregator.sh [directory] [output_file]
```

**Parameters:**
- `directory`: Directory containing error title files (default: `job_errors_titles`)
- `output_file`: Output markdown report (default: `job_error_report.md`)
+
+ + ## Setup ### Prerequisites @@ -301,10 +306,6 @@ Processing run: 12345678 No job summary found, skipping... # Actually rate limited! ``` -**Workaround:** Wait 1 hour and re-run (rate limits reset hourly) - -**Better Solution Available:** `claude_review_collector_ci.sh` includes rate limit monitoring and automatic waiting. See `CI_IMPROVEMENTS.md`. - --- ### ⚠️ Medium Priority Issues @@ -338,7 +339,7 @@ Processing run: 12345678 - 100 runs: ~10 minutes - 500 runs: ~50 minutes -**Potential Solution:** Implement parallel processing with `xargs -P` or GNU `parallel`. See `CI_IMPROVEMENTS.md` for implementation examples. +**Potential Solution:** Implement parallel processing with `xargs -P` or GNU `parallel`. --- @@ -439,72 +440,4 @@ Processing run: 18589302302 find job_* -type f -mtime +30 -delete ``` ---- - -### 🎯 Recommended Actions - -**Immediate (Do Now):** -1. ✅ Review `CI_IMPROVEMENTS.md` for solutions to Critical issues #1, #2, #3 -2. ✅ Consider switching to `claude_review_collector_ci.sh` (solves 6 out of 10 issues) -3. ⚠️ Add cache directories to `.gitignore` - -**Short-term (This Week):** -1. Test `claude_review_collector_ci.sh` locally -2. Deploy GitHub Actions workflow for automated analysis -3. Add cache cleanup cronjob/script - -**Long-term (This Month):** -1. Build enhanced analyzer for contextual error analysis -2. Implement parallel processing for faster execution -3. Create dashboard/visualization for error trends - ---- - -### 📚 Related Documentation - -For solutions to these issues: -- **CI-Ready Version:** See `claude_review_collector_ci.sh` and `CI_IMPROVEMENTS.md` -- **GitHub Actions Setup:** See `.github/workflows/ai-review-error-analysis.yml.example` -- **Migration Guide:** See `QUICK_START_CI.md` (deleted but can be recreated) -- **Before/After Comparison:** See `BEFORE_AFTER_COMPARISON.md` -- **Complete Analysis:** See `CI_READY_SUMMARY.md` - ---- - -## Troubleshooting - -### Error: "GITHUB_USER_SESSION environment variable is not set" -You need to set the `GITHUB_USER_SESSION` environment variable. See the Setup section above. - -### Error: "gh: command not found" -Install GitHub CLI: `brew install gh` (macOS) or follow [official instructions](https://cli.github.com/manual/installation) - -### Error: "jq: command not found" -Install jq: `brew install jq` (macOS) or `apt-get install jq` (Linux) - -### Error: "gh run list" fails -Make sure GitHub CLI is authenticated: `gh auth login` - -### "No job summary found" for many runs -This could be: -1. Rate limiting (wait 1 hour) -2. Cookie expired (get new cookie) -3. Workflow runs don't have job summaries (normal for some runs) - -### Script is very slow -- First run is always slow (fetching data) -- Subsequent runs use cache (much faster) -- Consider using CI-ready version for better performance - ---- - -## Security Best Practices - -1. ✅ **DO** store credentials in environment variables -2. ✅ **DO** use a `.env` file (and gitignore it) -3. ✅ **DO** rotate your session token periodically -4. ❌ **DON'T** commit credentials to version control -5. ❌ **DON'T** share your session token -6. ❌ **DON'T** hardcode credentials in scripts - -**Note:** The CI-ready version (`claude_review_collector_ci.sh`) uses GitHub tokens instead of cookies, which is more secure and doesn't expire. See `CI_IMPROVEMENTS.md` for details. +--- \ No newline at end of file