Skip to content

Commit dec0c92

Browse files
update: add judges summaries per eval (#9)
* update: add judges summaries per eval * update: change location of judge summary * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip --------- Co-authored-by: Mohammad Bagher Abiyat <37929992+Aslemammad@users.noreply.github.com>
1 parent d34556b commit dec0c92

File tree

14 files changed

+428
-45
lines changed

14 files changed

+428
-45
lines changed

.github/workflows/benchmark-reusable.yml

Lines changed: 143 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,28 @@ permissions:
1717
actions: read
1818

1919
jobs:
20+
prepare-analysis:
21+
name: Prepare Judge Analysis Matrix
22+
runs-on: ubuntu-latest
23+
outputs:
24+
evals: ${{ steps.compute.outputs.evals }}
25+
steps:
26+
- name: Extract unique evaluations
27+
id: compute
28+
env:
29+
MATRIX_JSON: ${{ inputs.matrix }}
30+
run: |
31+
set -euo pipefail
32+
evals=$(jq -c '.include | unique_by(.eval) | map({ eval: .eval, safe: (.eval | gsub("/"; "-")) })' <<<"$MATRIX_JSON")
33+
34+
if [ -z "${evals}" ] || [ "${evals}" = "null" ]; then
35+
echo "No evaluations found in matrix definition." >&2
36+
evals="[]"
37+
fi
38+
39+
echo "Analysis eval matrix: ${evals}"
40+
echo "evals=${evals}" >> "$GITHUB_OUTPUT"
41+
2042
benchmark:
2143
name: Benchmark ${{ matrix.agent }} / ${{ matrix.model }} / ${{ matrix.eval }}
2244
runs-on: ubuntu-latest
@@ -167,9 +189,111 @@ jobs:
167189
name: ${{ steps.artifact.outputs.name }}
168190
path: benchmark.json
169191

192+
eval-analysis:
193+
name: Judge Analysis - ${{ matrix.eval }}
194+
runs-on: ubuntu-latest
195+
needs:
196+
- benchmark
197+
- prepare-analysis
198+
if: needs.prepare-analysis.outputs.evals != '[]'
199+
environment: production
200+
strategy:
201+
fail-fast: false
202+
matrix:
203+
include: ${{ fromJSON(needs.prepare-analysis.outputs.evals) }}
204+
steps:
205+
- name: Checkout repository
206+
uses: actions/checkout@v4
207+
208+
- name: Setup Bun
209+
uses: oven-sh/setup-bun@v1
210+
with:
211+
bun-version: 1.2.21
212+
213+
- name: Install dependencies
214+
run: bun install --frozen-lockfile
215+
216+
- name: Download benchmark artifacts for eval
217+
uses: actions/download-artifact@v4
218+
with:
219+
path: eval-benchmarks
220+
pattern: benchmark-*-*-${{ matrix.safe }}
221+
222+
- name: Merge benchmark exports
223+
id: merge
224+
run: |
225+
set -euo pipefail
226+
227+
if [ ! -d eval-benchmarks ] || ! find eval-benchmarks -type f -name '*.json' -print -quit | grep -q .; then
228+
echo "No benchmark artifacts found for eval ${{ matrix.eval }}; skipping analysis."
229+
echo "has_data=false" >> "$GITHUB_OUTPUT"
230+
exit 0
231+
fi
232+
233+
bun run scripts/merge-benchmark-exports.ts eval-benchmarks merged-benchmark.json
234+
echo "Merged benchmark export ready for analysis."
235+
echo "has_data=true" >> "$GITHUB_OUTPUT"
236+
237+
- name: Run judges analysis
238+
if: steps.merge.outputs.has_data == 'true'
239+
id: analysis
240+
env:
241+
OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
242+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
243+
CODEX_API_KEY: ${{ secrets.CODEX_API_KEY }}
244+
run: |
245+
set -euo pipefail
246+
bun run scripts/analysis.ts merged-benchmark.json > analysis.txt
247+
cat analysis.txt
248+
249+
- name: Determine analysis job URL
250+
if: steps.merge.outputs.has_data == 'true'
251+
id: analysis_url
252+
env:
253+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
254+
MATRIX_EVAL: ${{ matrix.eval }}
255+
MATRIX_SAFE: ${{ matrix.safe }}
256+
run: |
257+
set -euo pipefail
258+
job_pattern="Judge Analysis - ${MATRIX_EVAL}"
259+
jobs_endpoint="https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100"
260+
job_json="$(curl -fsSL \
261+
-H "Authorization: token ${GITHUB_TOKEN}" \
262+
-H "Accept: application/vnd.github+json" \
263+
"${jobs_endpoint}")"
264+
265+
job_url="$(printf '%s\n' "${job_json}" \
266+
| jq -r --arg pattern "$job_pattern" 'select(type=="object" and has("jobs")) | .jobs[] | select(.name | contains($pattern)) | .html_url' \
267+
| head -n 1)"
268+
269+
if [ -z "${job_url}" ] || [ "${job_url}" = "null" ]; then
270+
echo "Failed to determine job URL for pattern ${job_pattern}" >&2
271+
printf '%s\n' "${job_json}" | jq -r '.jobs[]?.name' >&2 || true
272+
exit 1
273+
fi
274+
275+
step_url="${job_url}#step:7:0"
276+
277+
jq -n \
278+
--arg eval "${MATRIX_EVAL}" \
279+
--arg safe "${MATRIX_SAFE}" \
280+
--arg url "${step_url}" \
281+
'{eval: $eval, safe: $safe, url: $url}' > analysis-info.json
282+
283+
- name: Upload analysis artifact
284+
if: steps.merge.outputs.has_data == 'true'
285+
uses: actions/upload-artifact@v4
286+
with:
287+
name: analysis-${{ matrix.safe }}
288+
path: |
289+
analysis.txt
290+
analysis-info.json
291+
170292
notify:
171293
runs-on: ubuntu-latest
172-
needs: benchmark
294+
needs:
295+
- benchmark
296+
- eval-analysis
173297
if: needs.benchmark.result == 'success'
174298
environment: production
175299
steps:
@@ -189,12 +313,30 @@ jobs:
189313
with:
190314
path: benchmarks
191315

316+
- name: Download analysis artifacts
317+
uses: actions/download-artifact@v4
318+
with:
319+
path: analysis
320+
pattern: analysis-*
321+
192322
- name: Merge benchmark exports
193323
run: bun run scripts/merge-benchmark-exports.ts benchmarks merged-benchmark.json
194324

325+
- name: Build analysis link map
326+
run: |
327+
set -euo pipefail
328+
mkdir -p analysis
329+
mapfile -d '' info_files < <(find analysis -type f -name 'analysis-info.json' -print0 2>/dev/null || true)
330+
if [ "${#info_files[@]}" -eq 0 ]; then
331+
echo "[]" > analysis/analysis-links.json
332+
else
333+
jq -s 'map({eval: .eval, url: .url, safe: .safe})' "${info_files[@]}" > analysis/analysis-links.json
334+
fi
335+
195336
- name: Send Discord notification
196337
env:
197338
DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }}
339+
ANALYSIS_LINKS_FILE: ${{ github.workspace }}/analysis/analysis-links.json
198340
run: |
199341
set -euo pipefail
200342
if [ ! -f merged-benchmark.json ]; then

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
node_modules
22
dist
33
benchmark.json
4+
results/

agents/claude-code.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import type {
1212
const sessionCache = new Map<string, string>();
1313

1414
export const models: string[] = [
15-
// "claude-sonnet-4-5",
15+
"claude-sonnet-4-5",
1616
// "claude-sonnet-4",
1717
// "claude-opus-4-1",
1818
// "claude-3-5-haiku",

agents/codex.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ const codexClient = new Codex();
2222
const threadCache = new Map<string, Thread>();
2323

2424
export const models: string[] = [
25-
// "gpt-5-codex",
25+
"gpt-5-codex",
2626
// "gpt-5",
2727
// "o3",
2828
// "o4-mini"
@@ -70,10 +70,14 @@ function logTurnItems(
7070
try {
7171
writeLog(process.stdout, JSON.stringify(item), options?.logPrefix);
7272
} catch (error) {
73-
const fallback = isCommandExecutionItem(item)
73+
const sanitizedItem = isCommandExecutionItem(item)
7474
? { ...item, aggregated_output: "<omitted>" }
7575
: item;
76-
writeLog(process.stdout, JSON.stringify(fallback), options?.logPrefix);
76+
writeLog(
77+
process.stdout,
78+
JSON.stringify(sanitizedItem),
79+
options?.logPrefix,
80+
);
7781
if (error instanceof Error) {
7882
writeLog(
7983
process.stderr,

agents/opencode.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,9 +71,9 @@ const sessionCache = new Map<string, string>();
7171

7272
export const models: string[] = [
7373
// "opencode/gpt-5",
74-
"opencode/gpt-5-codex",
75-
// "opencode/claude-sonnet-4-5",
76-
// "opencode/big-pickle",
74+
// "opencode/gpt-5-codex",
75+
"opencode/claude-sonnet-4-5",
76+
"opencode/big-pickle",
7777
// "opencode/claude-sonnet-4",
7878
// "opencode/claude-3-5-haiku",
7979
// "opencode/claude-opus-4-1",

cli.ts

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,7 @@ import type {
3232
AggregationSummary,
3333
ScoreAggregationInput,
3434
} from "~/lib/utils/scoreAggregation.js";
35-
import type {
36-
Episode,
37-
EvaluationRunExport,
38-
Usage,
39-
} from "~/types/export.js";
35+
import type { Episode, EvaluationRunExport, Usage } from "~/types/export.js";
4036
import { withRetries, withTimeout } from "~/lib/utils/retry.js";
4137
import { buildRadarChartUrl } from "~/lib/charts.js";
4238

@@ -769,7 +765,7 @@ function summarizeAggregation(
769765
to: datasetEval.to,
770766
},
771767
model,
772-
jobUrl: process.env.GITHUB_BENCHMARK_JOB_URL,
768+
jobUrl: process.env.GITHUB_BENCHMARK_JOB_URL!,
773769
finalScore: aggregation.finalScore,
774770
baseScore: aggregation.baseScore,
775771
variancePenalty: aggregation.variancePenalty,

dataset.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
commands:
4242
- ./.venv/bin/pytest -vv
4343
- ./.venv/bin/flake8 datadog_lambda/
44-
4544
- repo: AlaminPu1007/algorithm-visualizer
4645
from: ca409519ec96a83ec8d6c2ba30f2487f8d601719
4746
to: 21845e972dd8e2378cbcd16accc5ae8cdd37acb2

judges.ts

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
11
import type { Judge } from "~/lib/judgeTypes.js";
22
import { getZenLanguageModel } from "~/lib/zenModels.js";
33

4-
const fallback = (envName: string, defaultValue: string): string =>
5-
process.env[envName]?.trim() || defaultValue;
6-
74
function resolveJudgeModelId(judgeName: Judge["name"]): string {
85
switch (judgeName) {
96
case "claude-4.5":
10-
return fallback("CLAUDE_MODEL", "opencode/claude-sonnet-4-5");
7+
return "opencode/claude-sonnet-4-5";
118
case "gpt-5-codex":
12-
return fallback("GPT5_CODEX_MODEL", "opencode/gpt-5-codex");
9+
return "opencode/gpt-5-codex";
1310
case "kimi":
14-
return fallback("KIMI_MODEL", "opencode/kimi-k2");
11+
return "opencode/kimi-k2";
1512
default:
1613
return judgeName;
1714
}

lib/planner.ts

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,6 @@ import type { DatasetEval } from "~/lib/dataset.js";
55
import { plannerExamples } from "~/lib/plannerExamples.js";
66
import { getZenLanguageModel } from "~/lib/zenModels.js";
77

8-
const fallback = (envName: string, defaultValue: string): string =>
9-
process.env[envName]?.trim() || defaultValue;
10-
118
export interface PlannerCommitDiff {
129
sha: string;
1310
title: string;
@@ -61,7 +58,7 @@ What NOT to include:
6158
6259
Always respond strictly as JSON conforming to the schema. Do not add commentary.`;
6360

64-
const plannerModelId = fallback("PLANNER_MODEL", "opencode/claude-sonnet-4-5");
61+
const plannerModelId = "opencode/claude-sonnet-4-5";
6562

6663
function buildSystemPrompt(): string {
6764
if (plannerExamples.length === 0) {

lib/summarizer.ts

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,6 @@ import { z } from "zod";
44
import type { DatasetEval } from "~/lib/dataset.js";
55
import { getZenLanguageModel } from "~/lib/zenModels.js";
66

7-
const fallback = (envName: string, defaultValue: string): string =>
8-
process.env[envName]?.trim() || defaultValue;
9-
107
export interface EpisodeActions {
118
episodeIndex: number;
129
actions: string[];
@@ -43,10 +40,7 @@ Guidelines:
4340
- Note any errors or issues encountered
4441
- Be objective and descriptive, not evaluative`;
4542

46-
const summarizerModelId = fallback(
47-
"SUMMARIZER_MODEL",
48-
"opencode/claude-sonnet-4-5",
49-
);
43+
const summarizerModelId = "opencode/claude-sonnet-4-5";
5044

5145
export async function generateActionsSummary(
5246
evaluation: DatasetEval,

0 commit comments

Comments
 (0)