update: add judges summaries per eval (#9)

tmickleydoyle · Aslemammad · web-flow · commit dec0c92d35a1 · 2025-11-03T20:11:22.000+03:30
* update: add judges summaries per eval

* update: change location of judge summary

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

---------

Co-authored-by: Mohammad Bagher Abiyat <37929992+Aslemammad@users.noreply.github.com>
diff --git a/.github/workflows/benchmark-reusable.yml b/.github/workflows/benchmark-reusable.yml
@@ -17,6 +17,34 @@ permissions:
   actions: read
 
 jobs:
+  # Builds the matrix consumed by the eval-analysis job: one entry per distinct
+  # `eval` in the `matrix` workflow input, plus a `safe` variant with "/"
+  # replaced by "-" that is used in artifact names.
+  prepare-analysis:
+    name: Prepare Judge Analysis Matrix
+    runs-on: ubuntu-latest
+    outputs:
+      evals: ${{ steps.compute.outputs.evals }}
+    steps:
+      - name: Extract unique evaluations
+        id: compute
+        env:
+          MATRIX_JSON: ${{ inputs.matrix }}
+        run: |
+          set -euo pipefail
+          # Default a missing `include` to [] and drop entries without a string
+          # `eval`; otherwise jq errors and `set -e` aborts before the fallback.
+          evals=$(jq -c '(.include // []) | map(select(.eval | type == "string")) | unique_by(.eval) | map({ eval: .eval, safe: (.eval | gsub("/"; "-")) })' <<<"$MATRIX_JSON")
+
+          # Empty jq output (e.g. an empty matrix input) means nothing to analyze.
+          if [ -z "${evals}" ] || [ "${evals}" = "null" ]; then
+            echo "No evaluations found in matrix definition." >&2
+            evals="[]"
+          fi
+
+          echo "Analysis eval matrix: ${evals}"
+          echo "evals=${evals}" >> "$GITHUB_OUTPUT"
+
   benchmark:
     name: Benchmark ${{ matrix.agent }} / ${{ matrix.model }} / ${{ matrix.eval }}
     runs-on: ubuntu-latest
@@ -167,9 +189,124 @@ jobs:
           name: ${{ steps.artifact.outputs.name }}
           path: benchmark.json
 
+  # Runs once per distinct eval after the benchmark job: merges that eval's
+  # benchmark artifacts, runs scripts/analysis.ts over the merged export, and
+  # uploads the output plus a link to this job for the notify job to pick up.
+  eval-analysis:
+    name: Judge Analysis - ${{ matrix.eval }}
+    runs-on: ubuntu-latest
+    needs:
+      - benchmark
+      - prepare-analysis
+    if: needs.prepare-analysis.outputs.evals != '[]'
+    environment: production
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJSON(needs.prepare-analysis.outputs.evals) }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v1
+        with:
+          bun-version: 1.2.21
+
+      - name: Install dependencies
+        run: bun install --frozen-lockfile
+
+      - name: Download benchmark artifacts for eval
+        uses: actions/download-artifact@v4
+        with:
+          path: eval-benchmarks
+          pattern: benchmark-*-*-${{ matrix.safe }}
+
+      - name: Merge benchmark exports
+        id: merge
+        env:
+          # Passed through the environment rather than an inline expression so
+          # the eval name is never interpolated into the shell script text.
+          MATRIX_EVAL: ${{ matrix.eval }}
+        run: |
+          set -euo pipefail
+
+          if [ ! -d eval-benchmarks ] || ! find eval-benchmarks -type f -name '*.json' -print -quit | grep -q .; then
+            echo "No benchmark artifacts found for eval ${MATRIX_EVAL}; skipping analysis."
+            echo "has_data=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          bun run scripts/merge-benchmark-exports.ts eval-benchmarks merged-benchmark.json
+          echo "Merged benchmark export ready for analysis."
+          echo "has_data=true" >> "$GITHUB_OUTPUT"
+
+      - name: Run judges analysis
+        if: steps.merge.outputs.has_data == 'true'
+        id: analysis
+        env:
+          OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          CODEX_API_KEY: ${{ secrets.CODEX_API_KEY }}
+        run: |
+          set -euo pipefail
+          bun run scripts/analysis.ts merged-benchmark.json > analysis.txt
+          cat analysis.txt
+
+      - name: Determine analysis job URL
+        if: steps.merge.outputs.has_data == 'true'
+        id: analysis_url
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          MATRIX_EVAL: ${{ matrix.eval }}
+          MATRIX_SAFE: ${{ matrix.safe }}
+        run: |
+          set -euo pipefail
+          job_pattern="Judge Analysis - ${MATRIX_EVAL}"
+          jobs_endpoint="https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100"
+          job_json="$(curl -fsSL \
+            -H "Authorization: token ${GITHUB_TOKEN}" \
+            -H "Accept: application/vnd.github+json" \
+            "${jobs_endpoint}")"
+
+          # Prefer a job whose name ends with the pattern so an eval whose name
+          # is a prefix of another eval's name cannot pick up the wrong job;
+          # fall back to the plain substring match otherwise.
+          job_url="$(printf '%s\n' "${job_json}" \
+            | jq -r --arg pattern "$job_pattern" 'select(type=="object" and has("jobs")) | [.jobs[] | select(.name | contains($pattern))] | (map(select(.name | endswith($pattern))) + .) | .[0].html_url // empty')"
+
+          if [ -z "${job_url}" ] || [ "${job_url}" = "null" ]; then
+            echo "Failed to determine job URL for pattern ${job_pattern}" >&2
+            printf '%s\n' "${job_json}" | jq -r '.jobs[]?.name' >&2 || true
+            exit 1
+          fi
+
+          # NOTE(review): the hard-coded step index presumably targets the
+          # "Run judges analysis" step; re-check it if steps are added or reordered.
+          step_url="${job_url}#step:7:0"
+
+          jq -n \
+            --arg eval "${MATRIX_EVAL}" \
+            --arg safe "${MATRIX_SAFE}" \
+            --arg url "${step_url}" \
+            '{eval: $eval, safe: $safe, url: $url}' > analysis-info.json
+
+      - name: Upload analysis artifact
+        if: steps.merge.outputs.has_data == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: analysis-${{ matrix.safe }}
+          path: |
+            analysis.txt
+            analysis-info.json
+
   notify:
     runs-on: ubuntu-latest
-    needs: benchmark
+    needs:
+      - benchmark
+      - eval-analysis
-    if: needs.benchmark.result == 'success'
+    # Still notify when eval-analysis failed or was skipped; only the benchmark
+    # result gates the notification.
+    if: ${{ !cancelled() && needs.benchmark.result == 'success' }}
     environment: production
     steps:
@@ -189,12 +313,30 @@ jobs:
         with:
           path: benchmarks
 
+      - name: Download analysis artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: analysis
+          pattern: analysis-*
+
       - name: Merge benchmark exports
         run: bun run scripts/merge-benchmark-exports.ts benchmarks merged-benchmark.json
 
+      - name: Build analysis link map
+        run: |
+          set -euo pipefail
+          mkdir -p analysis
+          mapfile -d '' info_files < <(find analysis -type f -name 'analysis-info.json' -print0 2>/dev/null || true)
+          if [ "${#info_files[@]}" -eq 0 ]; then
+            echo "[]" > analysis/analysis-links.json
+          else
+            jq -s 'map({eval: .eval, url: .url, safe: .safe})' "${info_files[@]}" > analysis/analysis-links.json
+          fi
+
       - name: Send Discord notification
         env:
           DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }}
+          ANALYSIS_LINKS_FILE: ${{ github.workspace }}/analysis/analysis-links.json
         run: |
           set -euo pipefail
           if [ ! -f merged-benchmark.json ]; then
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 node_modules
 dist
 benchmark.json
+results/
diff --git a/agents/claude-code.ts b/agents/claude-code.ts
@@ -12,7 +12,7 @@ import type {
 const sessionCache = new Map<string, string>();
 
 export const models: string[] = [
-  // "claude-sonnet-4-5",
+  "claude-sonnet-4-5",
   // "claude-sonnet-4",
   // "claude-opus-4-1",
   // "claude-3-5-haiku",
diff --git a/agents/codex.ts b/agents/codex.ts
@@ -22,7 +22,7 @@ const codexClient = new Codex();
 const threadCache = new Map<string, Thread>();
 
 export const models: string[] = [
-  // "gpt-5-codex",
+  "gpt-5-codex",
   // "gpt-5",
   // "o3",
   // "o4-mini"
@@ -70,10 +70,14 @@ function logTurnItems(
     try {
       writeLog(process.stdout, JSON.stringify(item), options?.logPrefix);
     } catch (error) {
-      const fallback = isCommandExecutionItem(item)
+      const sanitizedItem = isCommandExecutionItem(item)
         ? { ...item, aggregated_output: "<omitted>" }
         : item;
-      writeLog(process.stdout, JSON.stringify(fallback), options?.logPrefix);
+      writeLog(
+        process.stdout,
+        JSON.stringify(sanitizedItem),
+        options?.logPrefix,
+      );
       if (error instanceof Error) {
         writeLog(
           process.stderr,
diff --git a/agents/opencode.ts b/agents/opencode.ts
@@ -71,9 +71,9 @@ const sessionCache = new Map<string, string>();
 
 export const models: string[] = [
   // "opencode/gpt-5",
-  "opencode/gpt-5-codex",
-  // "opencode/claude-sonnet-4-5",
-  // "opencode/big-pickle",
+  // "opencode/gpt-5-codex",
+  "opencode/claude-sonnet-4-5",
+  "opencode/big-pickle",
   // "opencode/claude-sonnet-4",
   // "opencode/claude-3-5-haiku",
   // "opencode/claude-opus-4-1",
diff --git a/cli.ts b/cli.ts
@@ -32,11 +32,7 @@ import type {
   AggregationSummary,
   ScoreAggregationInput,
 } from "~/lib/utils/scoreAggregation.js";
-import type {
-  Episode,
-  EvaluationRunExport,
-  Usage,
-} from "~/types/export.js";
+import type { Episode, EvaluationRunExport, Usage } from "~/types/export.js";
 import { withRetries, withTimeout } from "~/lib/utils/retry.js";
 import { buildRadarChartUrl } from "~/lib/charts.js";
 
@@ -769,7 +765,7 @@ function summarizeAggregation(
       to: datasetEval.to,
     },
     model,
-    jobUrl: process.env.GITHUB_BENCHMARK_JOB_URL,
+    jobUrl: process.env.GITHUB_BENCHMARK_JOB_URL!,
     finalScore: aggregation.finalScore,
     baseScore: aggregation.baseScore,
     variancePenalty: aggregation.variancePenalty,
diff --git a/dataset.yaml b/dataset.yaml
@@ -41,7 +41,6 @@
         commands:
           - ./.venv/bin/pytest -vv
           - ./.venv/bin/flake8 datadog_lambda/
-
 - repo: AlaminPu1007/algorithm-visualizer
   from: ca409519ec96a83ec8d6c2ba30f2487f8d601719
   to: 21845e972dd8e2378cbcd16accc5ae8cdd37acb2
diff --git a/judges.ts b/judges.ts
@@ -1,17 +1,14 @@
 import type { Judge } from "~/lib/judgeTypes.js";
 import { getZenLanguageModel } from "~/lib/zenModels.js";
 
-const fallback = (envName: string, defaultValue: string): string =>
-  process.env[envName]?.trim() || defaultValue;
-
 function resolveJudgeModelId(judgeName: Judge["name"]): string {
   switch (judgeName) {
     case "claude-4.5":
-      return fallback("CLAUDE_MODEL", "opencode/claude-sonnet-4-5");
+      return "opencode/claude-sonnet-4-5";
     case "gpt-5-codex":
-      return fallback("GPT5_CODEX_MODEL", "opencode/gpt-5-codex");
+      return "opencode/gpt-5-codex";
     case "kimi":
-      return fallback("KIMI_MODEL", "opencode/kimi-k2");
+      return "opencode/kimi-k2";
     default:
       return judgeName;
   }
diff --git a/lib/planner.ts b/lib/planner.ts
@@ -5,9 +5,6 @@ import type { DatasetEval } from "~/lib/dataset.js";
 import { plannerExamples } from "~/lib/plannerExamples.js";
 import { getZenLanguageModel } from "~/lib/zenModels.js";
 
-const fallback = (envName: string, defaultValue: string): string =>
-  process.env[envName]?.trim() || defaultValue;
-
 export interface PlannerCommitDiff {
   sha: string;
   title: string;
@@ -61,7 +58,7 @@ What NOT to include:
 
 Always respond strictly as JSON conforming to the schema. Do not add commentary.`;
 
-const plannerModelId = fallback("PLANNER_MODEL", "opencode/claude-sonnet-4-5");
+const plannerModelId = "opencode/claude-sonnet-4-5";
 
 function buildSystemPrompt(): string {
   if (plannerExamples.length === 0) {
diff --git a/lib/summarizer.ts b/lib/summarizer.ts
@@ -4,9 +4,6 @@ import { z } from "zod";
 import type { DatasetEval } from "~/lib/dataset.js";
 import { getZenLanguageModel } from "~/lib/zenModels.js";
 
-const fallback = (envName: string, defaultValue: string): string =>
-  process.env[envName]?.trim() || defaultValue;
-
 export interface EpisodeActions {
   episodeIndex: number;
   actions: string[];
@@ -43,10 +40,7 @@ Guidelines:
 - Note any errors or issues encountered
 - Be objective and descriptive, not evaluative`;
 
-const summarizerModelId = fallback(
-  "SUMMARIZER_MODEL",
-  "opencode/claude-sonnet-4-5",
-);
+const summarizerModelId = "opencode/claude-sonnet-4-5";
 
 export async function generateActionsSummary(
   evaluation: DatasetEval,
diff --git a/scripts/analysis.ts b/scripts/analysis.ts
diff --git a/scripts/discord-sample.ts b/scripts/discord-sample.ts
diff --git a/scripts/merge-benchmark-exports.ts b/scripts/merge-benchmark-exports.ts
diff --git a/types/export.ts b/types/export.ts