Skip to content

Commit 3e37495

Browse files
authored
make stricter completion condition for extract (#151)
* make stricter completion condition for extract * small prompt update * add github stars eval * small eval change * remove unnecessary vars * fix nits
1 parent b3ba2ef commit 3e37495

File tree

2 files changed

+65
-4
lines changed

2 files changed

+65
-4
lines changed

evals/index.eval.ts

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,63 @@ const homedepot = async () => {
390390
}
391391
};
392392

393+
/**
 * Converts a rendered star-count string such as "987", "1,234", "16.1k" or
 * "1.2m" into an integer.
 *
 * Surrounding whitespace and thousands separators are ignored, and a trailing
 * "k" / "m" (case-insensitive) scales the value by 1,000 / 1,000,000.
 * The result is rounded because scaling a parsed decimal is not exact in
 * binary floating point (16.1 * 1000 evaluates to 16100.000000000002), which
 * would make a strict equality check against an integer fail spuriously.
 *
 * @param raw - text of the star counter as read from the page
 * @returns the star count as an integer, or NaN when no number can be parsed
 */
const parseGithubStarCount = (raw: string): number => {
  const text = raw.trim().toLowerCase().replace(/,/g, "");
  const multiplier = text.endsWith("k")
    ? 1000
    : text.endsWith("m")
      ? 1000000
      : 1;
  const digits = multiplier === 1 ? text : text.slice(0, -1);
  return Math.round(parseFloat(digits) * multiplier);
};

/**
 * Eval: asks Stagehand to extract the number of stars of facebook/react and
 * compares it with the count read from the page's `#repo-stars-counter-star`
 * element.
 *
 * @returns an object with `_success` (true only when the extracted number
 *   equals the parsed on-page count), the session's `debugUrl` / `sessionUrl`,
 *   the collected `logs`, and either the extracted `stars` or, on failure,
 *   a JSON round-tripped `error`.
 */
const extract_github_stars = async () => {
  const logger = new EvalLogger();

  const stagehand = new Stagehand({
    env,
    verbose: 2,
    headless: process.env.HEADLESS !== "false",
    logger: (message: { category?: string; message: string }) => {
      logger.log(message.message);
    },
  });

  logger.init(stagehand);

  const { debugUrl, sessionUrl } = await stagehand.init();

  try {
    await stagehand.page.goto("https://github.com/facebook/react");

    const { stars } = await stagehand.extract({
      instruction: "Extract the number of stars for the project",
      schema: z.object({
        stars: z.number().describe("the number of stars for the project"),
      }),
      modelName: "gpt-4o-2024-08-06",
    });

    // Ground truth: the counter text as rendered on the page (may be
    // abbreviated, e.g. "228k"), normalised to an integer.
    const expectedStarsString = await stagehand.page
      .locator("#repo-stars-counter-star")
      .first()
      .innerHTML();
    const expectedStars = parseGithubStarCount(expectedStarsString);

    // Best-effort cleanup: a failure to close must not fail the eval.
    await stagehand.context.close().catch(() => {});
    return {
      // NaN (unparseable counter) never equals `stars`, so it reports failure.
      _success: stars === expectedStars,
      stars,
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  } catch (error) {
    console.error("Error or timeout occurred:", error);
    await stagehand.context.close().catch(() => {});
    return {
      _success: false,
      // NOTE(review): Error instances have non-enumerable message/stack, so
      // this round-trip yields {} for them; the console.error above keeps
      // the details.
      error: JSON.parse(JSON.stringify(error, null, 2)),
      debugUrl,
      sessionUrl,
      logs: logger.getLogs(),
    };
  }
};
449+
393450
const extract_collaborators_from_github_repository = async () => {
394451
const logger = new EvalLogger();
395452

@@ -1145,6 +1202,7 @@ const tasks = {
11451202
peeler_complex,
11461203
wikipedia,
11471204
simple_google_search,
1205+
extract_github_stars,
11481206
extract_collaborators_from_github_repository,
11491207
extract_last_twenty_github_commits,
11501208
costar,
@@ -1194,6 +1252,7 @@ const testcases = [
11941252
},
11951253
{ input: { name: "peeler_complex" } },
11961254
{ input: { name: "simple_google_search" } },
1255+
{ input: { name: "extract_github_stars" } },
11971256
{
11981257
input: {
11991258
name: "extract_collaborators_from_github_repository",

lib/prompt.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,10 @@ const metadataSystemPrompt = `You are an AI assistant tasked with evaluating the
252252
Analyze the extraction response and determine if the task is completed or if more information is needed.
253253
254254
Strictly abide by the following criteria:
255-
1. If you are certain that the instruction is completed, set the completion status to true, even if there are still chunks left.
256-
2. If there could still be more information to extract and there are still chunks left, set the completion status to false.`;
255+
1. Once the instruction has been satisfied by the current extraction response, ALWAYS set completion status to true and stop processing, regardless of remaining chunks.
256+
2. Only set completion status to false if BOTH of these conditions are true:
257+
- The instruction has not been satisfied yet
258+
- There are still chunks left to process (chunksTotal > chunksSeen)`;
257259

258260
export function buildMetadataSystemPrompt() {
259261
return {
@@ -272,8 +274,8 @@ export function buildMetadataPrompt(
272274
role: "user",
273275
content: `Instruction: ${instruction}
274276
Extracted content: ${JSON.stringify(extractionResponse, null, 2)}
275-
Chunks seen: ${chunksSeen}
276-
Chunks total: ${chunksTotal}`,
277+
chunksSeen: ${chunksSeen}
278+
chunksTotal: ${chunksTotal}`,
277279
};
278280
}
279281

0 commit comments

Comments
 (0)