@@ -17,6 +17,28 @@ permissions:
1717 actions : read
1818
1919jobs :
# Job (child of `jobs`): derives the matrix for the judge-analysis job.
# Reads the benchmark matrix JSON from `inputs.matrix` and emits, as the
# `evals` output, a compact JSON array with one `{eval, safe}` object per
# unique `eval` value. `safe` is the eval name with every "/" replaced by "-".
prepare-analysis:
  name: Prepare Judge Analysis Matrix
  runs-on: ubuntu-latest
  outputs:
    evals: ${{ steps.compute.outputs.evals }}
  steps:
    - name: Extract unique evaluations
      id: compute
      env:
        # Passed through the environment so the JSON is never spliced into
        # the shell script text.
        MATRIX_JSON: ${{ inputs.matrix }}
      run: |
        set -euo pipefail

        # `.include // []` keeps a matrix without an `include` key from
        # making jq fail; entries without a string `eval` are dropped so
        # `gsub` never receives a non-string value.
        evals=$(jq -c '
          (.include // [])
          | map(select(.eval? | type == "string"))
          | unique_by(.eval)
          | map({ eval: .eval, safe: (.eval | gsub("/"; "-")) })
        ' <<<"$MATRIX_JSON")

        # jq prints nothing for empty input; normalise that (and a literal
        # null) to an empty array so downstream `fromJSON` stays valid.
        if [ -z "${evals}" ] || [ "${evals}" = "null" ]; then
          echo "No evaluations found in matrix definition." >&2
          evals="[]"
        fi

        echo "Analysis eval matrix: ${evals}"
        echo "evals=${evals}" >> "$GITHUB_OUTPUT"
41+
2042 benchmark :
2143 name : Benchmark ${{ matrix.agent }} / ${{ matrix.model }} / ${{ matrix.eval }}
2244 runs-on : ubuntu-latest
@@ -167,9 +189,111 @@ jobs:
167189 name : ${{ steps.artifact.outputs.name }}
168190 path : benchmark.json
169191
# Job (child of `jobs`): runs the judges analysis once per unique evaluation.
# For each `{eval, safe}` matrix entry it downloads the matching benchmark
# artifacts, merges them, runs `scripts/analysis.ts`, resolves a link to this
# job's analysis step, and uploads `analysis.txt` plus `analysis-info.json`
# as the `analysis-<safe>` artifact.
eval-analysis:
  name: Judge Analysis - ${{ matrix.eval }}
  runs-on: ubuntu-latest
  needs:
    - benchmark
    - prepare-analysis
  # Skip entirely when prepare-analysis produced an empty matrix.
  if: needs.prepare-analysis.outputs.evals != '[]'
  environment: production
  strategy:
    fail-fast: false
    matrix:
      include: ${{ fromJSON(needs.prepare-analysis.outputs.evals) }}
  steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Setup Bun
      uses: oven-sh/setup-bun@v1
      with:
        bun-version: '1.2.21'

    - name: Install dependencies
      run: bun install --frozen-lockfile

    - name: Download benchmark artifacts for eval
      uses: actions/download-artifact@v4
      with:
        path: eval-benchmarks
        # NOTE(review): this glob also matches artifacts of any other eval
        # whose safe name ends in `-<safe>`; confirm the artifact naming
        # scheme used by the benchmark job rules that out.
        pattern: benchmark-*-*-${{ matrix.safe }}

    - name: Merge benchmark exports
      id: merge
      env:
        # Read the eval name from the environment instead of interpolating
        # `${{ matrix.eval }}` into the script, so its content cannot be
        # interpreted as shell code.
        MATRIX_EVAL: ${{ matrix.eval }}
      run: |
        set -euo pipefail

        # No JSON export downloaded: report has_data=false and succeed so
        # the remaining steps are skipped rather than failed.
        if [ ! -d eval-benchmarks ] || ! find eval-benchmarks -type f -name '*.json' -print -quit | grep -q .; then
          echo "No benchmark artifacts found for eval ${MATRIX_EVAL}; skipping analysis."
          echo "has_data=false" >> "$GITHUB_OUTPUT"
          exit 0
        fi

        bun run scripts/merge-benchmark-exports.ts eval-benchmarks merged-benchmark.json
        echo "Merged benchmark export ready for analysis."
        echo "has_data=true" >> "$GITHUB_OUTPUT"

    - name: Run judges analysis
      if: steps.merge.outputs.has_data == 'true'
      id: analysis
      env:
        OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
        ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        CODEX_API_KEY: ${{ secrets.CODEX_API_KEY }}
      run: |
        set -euo pipefail
        # tee keeps the log populated even when the script fails part-way;
        # pipefail still propagates the script's exit status.
        bun run scripts/analysis.ts merged-benchmark.json | tee analysis.txt

    - name: Determine analysis job URL
      if: steps.merge.outputs.has_data == 'true'
      id: analysis_url
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        MATRIX_EVAL: ${{ matrix.eval }}
        MATRIX_SAFE: ${{ matrix.safe }}
      run: |
        set -euo pipefail
        job_pattern="Judge Analysis - ${MATRIX_EVAL}"
        # NOTE(review): only the first 100 jobs of the run are inspected;
        # larger runs would need pagination.
        jobs_endpoint="https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/jobs?per_page=100"
        job_json="$(curl -fsSL \
          -H "Authorization: token ${GITHUB_TOKEN}" \
          -H "Accept: application/vnd.github+json" \
          "${jobs_endpoint}")"

        # Match on the end of the job name: this tolerates a leading prefix
        # on the name while keeping an eval whose name is a prefix of
        # another eval from resolving to the wrong job. `first(...)` stops
        # after one hit without a downstream `head`.
        job_url="$(printf '%s\n' "${job_json}" \
          | jq -r --arg pattern "$job_pattern" \
              'select(type == "object" and has("jobs"))
               | first(.jobs[] | select((.name // "") | endswith($pattern)) | .html_url)')"

        if [ -z "${job_url}" ] || [ "${job_url}" = "null" ]; then
          echo "Failed to determine job URL for pattern ${job_pattern}" >&2
          printf '%s\n' "${job_json}" | jq -r '.jobs[]?.name' >&2 || true
          exit 1
        fi

        # NOTE(review): the step index is hard-coded; presumably 7 is the
        # "Run judges analysis" step in the UI numbering. Update it if
        # steps are added or removed above.
        step_url="${job_url}#step:7:0"

        jq -n \
          --arg eval "${MATRIX_EVAL}" \
          --arg safe "${MATRIX_SAFE}" \
          --arg url "${step_url}" \
          '{eval: $eval, safe: $safe, url: $url}' > analysis-info.json

    - name: Upload analysis artifact
      if: steps.merge.outputs.has_data == 'true'
      uses: actions/upload-artifact@v4
      with:
        name: analysis-${{ matrix.safe }}
        path: |
          analysis.txt
          analysis-info.json
291+
170292 notify :
171293 runs-on : ubuntu-latest
172- needs : benchmark
294+ needs :
295+ - benchmark
296+ - eval-analysis
173297 if : needs.benchmark.result == 'success'
174298 environment : production
175299 steps :
@@ -189,12 +313,30 @@ jobs:
189313 with :
190314 path : benchmarks
191315
# Step (entry in a job's `steps` list): fetches every artifact whose name
# matches `analysis-*` into the local `analysis` directory.
- name: Download analysis artifacts
  uses: actions/download-artifact@v4
  with:
    pattern: analysis-*
    path: analysis
321+
192322 - name : Merge benchmark exports
193323 run : bun run scripts/merge-benchmark-exports.ts benchmarks merged-benchmark.json
194324
# Step (entry in a job's `steps` list): collects every `analysis-info.json`
# found under `analysis/` into a single JSON array of `{eval, url, safe}`
# objects at `analysis/analysis-links.json`; writes `[]` when none exist.
- name: Build analysis link map
  run: |
    set -euo pipefail
    mkdir -p analysis
    links_file=analysis/analysis-links.json

    # NUL-delimited read keeps paths containing spaces or newlines intact.
    mapfile -d '' link_sources < <(find analysis -type f -name 'analysis-info.json' -print0 2>/dev/null || true)

    if [ "${#link_sources[@]}" -gt 0 ]; then
      jq -s 'map({eval: .eval, url: .url, safe: .safe})' "${link_sources[@]}" > "$links_file"
    else
      echo "[]" > "$links_file"
    fi
335+
195336 - name : Send Discord notification
196337 env :
197338 DISCORD_WEBHOOK_URL : ${{ secrets.DISCORD_WEBHOOK_URL }}
339+ ANALYSIS_LINKS_FILE : ${{ github.workspace }}/analysis/analysis-links.json
198340 run : |
199341 set -euo pipefail
200342 if [ ! -f merged-benchmark.json ]; then
0 commit comments