Skip to content

Commit ea446df

Browse files
update: create prompts for each eval in dataset.yml (#10)
* wip * wip --------- Co-authored-by: Mohammad Bagher Abiyat <37929992+Aslemammad@users.noreply.github.com>
1 parent dec0c92 commit ea446df

File tree

10 files changed

+208
-76
lines changed

10 files changed

+208
-76
lines changed

cli.ts

Lines changed: 72 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,25 @@
1-
#!/usr/bin/env node
1+
#!/usr/bin/env bun
22
import { strict as assert } from "node:assert";
33
import process from "node:process";
44

55
import { execSync } from "node:child_process";
66
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
77
import { dirname, join } from "node:path";
88
import { tmpdir } from "node:os";
9-
10-
import { createOpencode } from "@opencode-ai/sdk";
11-
import detectPort from "detect-port";
12-
139
import { getAgent, listAgents } from "~/agents/index.js";
1410
import type { AgentRegistration } from "~/agents/index.js";
1511
import { listScores, scores as scoreRegistry } from "~/scores/index.js";
1612
import { dataset } from "~/lib/dataset.js";
1713
import type { DatasetEval, ScoreAssignment } from "~/lib/dataset.js";
18-
import { generatePlannerTasks, type PlannerTask } from "~/lib/planner.js";
14+
import { generatePromptsForEval } from "~/lib/prompts.js";
1915
import {
2016
generateActionsSummary,
2117
type EpisodeActions,
2218
} from "~/lib/summarizer.js";
23-
import { fetchPlannerCommitDiffs } from "~/lib/github.js";
2419
import { finalizeAgentChanges } from "~/lib/finalizeAgentChanges.js";
20+
import { loadPromptsFile } from "~/lib/prompts.js";
2521
import { judges, getJudgeModelId } from "~/judges.js";
26-
import {
27-
aggregateScores,
28-
averageJudgeScore,
29-
} from "~/lib/utils/scoreAggregation.js";
22+
import { aggregateScores } from "~/lib/utils/scoreAggregation.js";
3023
import type { Judge } from "~/lib/judgeTypes.js";
3124
import type {
3225
AggregationSummary,
@@ -155,6 +148,62 @@ function isHelpRequest(arg: string | undefined): boolean {
155148
);
156149
}
157150

151+
/**
 * Handles the `orvl prompts` subcommand.
 *
 * With no arguments, prompts are generated for every entry in `dataset`.
 * With `--eval <repo>`, only the dataset entry whose `repo` matches is used.
 * `--help` / `-h` as the first argument prints usage and does nothing else.
 *
 * On an unknown option or an unknown evaluation the function reports the
 * problem on stderr, sets `process.exitCode = 1`, and returns without
 * generating anything.
 *
 * @param args - Command-line arguments that follow the `prompts` keyword.
 * @returns Resolves once `generatePromptsForEval` has settled for every
 *   selected evaluation; rejects if any of those calls rejects.
 * @throws AssertionError when `--eval` is given without a value.
 */
async function handlePrompts(args: string[]): Promise<void> {
  // Only an explicit help flag prints usage. Previously an empty argument
  // list also landed here, which made the "generate for all evaluations"
  // path below unreachable.
  if (args[0] === "--help" || args[0] === "-h") {
    console.log("Usage: orvl prompts [--eval <repo>] ");
    console.log("");
    console.log("Options:");
    console.log(
      "  --eval <repo>    Generate prompts for a specific evaluation (e.g., DataDog/datadog-lambda-python)",
    );
    console.log("");
    console.log(
      "When --eval is omitted, prompts are generated for every evaluation in the dataset.",
    );
    console.log("");
    console.log("Examples:");
    console.log("  orvl prompts");
    console.log("  orvl prompts --eval DataDog/datadog-lambda-python");
    return;
  }

  let targetEval: string | undefined;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === "--eval") {
      // Consume the option's value from the next position.
      i++;
      targetEval = args[i];
      assert(targetEval, "Option --eval requires a value");
    } else {
      console.error(`Unknown option: ${arg}`);
      process.exitCode = 1;
      return;
    }
  }

  let evalsToGenerate: DatasetEval[];

  if (targetEval === undefined) {
    // No --eval given: cover the whole dataset (copied so the shared array
    // is never handed out directly).
    evalsToGenerate = [...dataset];
  } else {
    const requestedRepo = targetEval;
    const evalDef = dataset.find((entry) => entry.repo === requestedRepo);
    if (!evalDef) {
      console.error(`Evaluation not found: ${requestedRepo}`);
      console.error("Available evaluations:");
      dataset.forEach((entry) => console.error(`  - ${entry.repo}`));
      process.exitCode = 1;
      return;
    }
    evalsToGenerate = [evalDef];
  }

  console.log(
    `Generating prompts for ${evalsToGenerate.length} evaluation(s)...\n`,
  );

  // Run all generations concurrently; the first rejection rejects the whole call.
  await Promise.all(
    evalsToGenerate.map((evalDef) => generatePromptsForEval(evalDef)),
  );
}
206+
158207
async function main(): Promise<void> {
159208
const args = process.argv.slice(2);
160209
const agentName = args[0];
@@ -164,6 +213,12 @@ async function main(): Promise<void> {
164213
return;
165214
}
166215

216+
// Handle special commands
217+
if (agentName === "prompts") {
218+
await handlePrompts(args.slice(1));
219+
return;
220+
}
221+
167222
let options: ParsedCliOptions;
168223
try {
169224
options = parseOptions(args.slice(1));
@@ -215,34 +270,12 @@ async function main(): Promise<void> {
215270

216271
const evalId = evalDefinition.repo;
217272

218-
let plannerTasks: PlannerTask[] = [];
219-
220-
try {
221-
console.log(`[${evalId} planner] Fetching commit diffs from GitHub...`);
222-
const commitDiffs = await fetchPlannerCommitDiffs(evalDefinition);
223-
224-
assert(
225-
commitDiffs.length > 0,
226-
`No commits found between ${evalDefinition.from} and ${evalDefinition.to} for ${evalDefinition.repo}.`,
227-
);
273+
const tasks = loadPromptsFile(evalDefinition.prompts);
228274

229-
plannerTasks = await generatePlannerTasks(evalDefinition, commitDiffs);
230-
231-
assert(
232-
plannerTasks.length > 0,
233-
`Planner produced no tasks for ${evalDefinition.repo} (${evalDefinition.from}..${evalDefinition.to}).`,
234-
);
235-
} catch (error) {
236-
if (error instanceof Error) {
237-
console.error(
238-
`Failed to prepare evaluation ${evalId}: ${error.message}`,
239-
);
240-
} else {
241-
console.error("Failed to prepare evaluation", evalId);
242-
}
243-
process.exitCode = 1;
244-
assert(false, "evaluation preparation failed");
245-
}
275+
assert(
276+
tasks.length > 0,
277+
`No prompts found in ${evalDefinition.prompts} for ${evalDefinition.repo}.`,
278+
);
246279

247280
const executeCombination = async (): Promise<{
248281
lines: string[];
@@ -309,7 +342,7 @@ async function main(): Promise<void> {
309342
let usage: Usage = { input: 0, output: 0 };
310343
const episodeActions: string[] = [];
311344

312-
for (const task of plannerTasks) {
345+
for (const task of tasks) {
313346
const logPrefix = `${prefix} ${task.commit}`;
314347

315348
try {

dataset.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
- repo: prismicio-community/course-fizzi-next
22
from: 15037446358508e153e765da49f8f5defa7fbbf6
33
to: 2760114f2647ebec8f63e0ecc2dc87a8cd4096ac
4+
prompts: prompts/course-fizzi-next.yaml
45
issues: []
56
scores:
67
api-signature:
@@ -22,6 +23,7 @@
2223
- repo: DataDog/datadog-lambda-python
2324
from: 93d4a07fa61a4d4d2feec08e722505a9e0cc8657
2425
to: d7763789f262b2da228f8210509e302e6e510d0a
26+
prompts: prompts/datadog-lambda-python.yaml
2527
issues: []
2628
scores:
2729
api-signature:
@@ -44,6 +46,7 @@
4446
- repo: AlaminPu1007/algorithm-visualizer
4547
from: ca409519ec96a83ec8d6c2ba30f2487f8d601719
4648
to: 21845e972dd8e2378cbcd16accc5ae8cdd37acb2
49+
prompts: prompts/algorithm-visualizer.yaml
4750
issues: []
4851
scores:
4952
api-signature:

lib/dataset.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ const datasetSchema = z.array(
1616
.regex(/^[^/]+\/[^/]+$/, "repo must follow the format <owner>/<name>."),
1717
from: z.string().min(1, "from commit SHA is required."),
1818
to: z.string().min(1, "to commit SHA is required."),
19+
prompts: z.string().min(1, "prompts file path is required."),
1920
issues: z.array(z.number().int()),
2021
scores: z.record(scoreConfigSchema)
2122
})

lib/github.ts

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import { request as octokitRequest } from "@octokit/request";
44
import type { RequestInterface } from "@octokit/types";
55

66
import type { DatasetEval } from "~/lib/dataset.js";
7-
import type { PlannerCommitDiff } from "~/lib/planner.js";
87

98
const DIFF_ACCEPT_HEADER = "application/vnd.github.v3.diff";
109

@@ -64,9 +63,15 @@ export async function fetchComparisonDiff(entry: DatasetEval): Promise<string> {
6463
return diff;
6564
}
6665

67-
export async function fetchPlannerCommitDiffs(
66+
/**
 * One commit together with its diff; this is the element type of the array
 * that `fetchCommitDiffs` resolves to.
 */
export interface CommitDiff {
67+
/** SHA of the commit the diff was fetched for. */
sha: string;
68+
/** Title of the commit — presumably the first line of its message; TODO confirm against fetchCommitDiffs. */
title: string;
69+
/** Diff text fetched for this commit. */
diff: string;
70+
}
71+
72+
export async function fetchCommitDiffs(
6873
entry: DatasetEval,
69-
): Promise<PlannerCommitDiff[]> {
74+
): Promise<CommitDiff[]> {
7075
const client = getRequestClient();
7176
const { owner, repo } = splitRepo(entry);
7277

@@ -124,7 +129,7 @@ export async function fetchPlannerCommitDiffs(
124129
sha,
125130
title,
126131
diff,
127-
} satisfies PlannerCommitDiff;
132+
} satisfies CommitDiff;
128133
} catch (error) {
129134
console.error(
130135
`Failed to fetch diff for commit ${sha} in ${entry.repo}:`,
@@ -135,5 +140,5 @@ export async function fetchPlannerCommitDiffs(
135140
}),
136141
);
137142

138-
return results.filter((value): value is PlannerCommitDiff => value !== null);
143+
return results.filter((value): value is CommitDiff => value !== null);
139144
}

0 commit comments

Comments
 (0)