Skip to content

Commit f97d703

Browse files
[chore] remove and update evals (#886)
# why
- Some of these evals were written when Stagehand had fundamentally different behaviour, so they needed to be updated to be useful evaluations.

# what changed
- Removed some 'un-passable' evals, and updated others to reflect current recommended Stagehand usage patterns.

# test plan
- This is it.
1 parent 69913fe commit f97d703

11 files changed

+22
-246
lines changed

evals/evals.config.json

Lines changed: 0 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -105,14 +105,6 @@
105105
"name": "costar",
106106
"categories": ["experimental"]
107107
},
108-
{
109-
"name": "expedia",
110-
"categories": ["experimental"]
111-
},
112-
{
113-
"name": "expedia_search",
114-
"categories": ["experimental"]
115-
},
116108
{
117109
"name": "extract_aigrant_companies",
118110
"categories": ["regression"]
@@ -133,10 +125,6 @@
133125
"name": "extract_snowshoeing_destinations",
134126
"categories": ["experimental"]
135127
},
136-
{
137-
"name": "google_jobs",
138-
"categories": ["experimental"]
139-
},
140128
{
141129
"name": "homedepot",
142130
"categories": ["experimental"]

evals/tasks/expedia.ts

Lines changed: 0 additions & 45 deletions
This file was deleted.

evals/tasks/expedia_search.ts

Lines changed: 0 additions & 58 deletions
This file was deleted.

evals/tasks/extract_partners.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,15 +11,15 @@ export const extract_partners: EvalFunction = async ({
1111
await stagehand.page.goto("https://ramp.com");
1212

1313
await stagehand.page.act({
14-
action: "move down to the bottom of the page.",
14+
action: "scroll to the bottom of the page",
1515
});
1616

1717
await stagehand.page.act({
1818
action: "Close the popup.",
1919
});
2020

2121
await stagehand.page.act({
22-
action: "Find and click on the link that leads to the partners page.",
22+
action: "click on the link that leads to the partners page.",
2323
});
2424

2525
const partners = await stagehand.page.extract({

evals/tasks/extract_press_releases.ts

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -24,9 +24,6 @@ export const extract_press_releases: EvalFunction = async ({
2424
try {
2525
await stagehand.page.goto(
2626
"https://browserbase.github.io/stagehand-eval-sites/sites/press-releases/",
27-
{
28-
waitUntil: "networkidle",
29-
},
3027
);
3128
await new Promise((resolve) => setTimeout(resolve, 5000));
3229

evals/tasks/google_jobs.ts

Lines changed: 0 additions & 94 deletions
This file was deleted.

evals/tasks/homedepot.ts

Lines changed: 8 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -9,21 +9,17 @@ export const homedepot: EvalFunction = async ({
99
}) => {
1010
try {
1111
await stagehand.page.goto("https://www.homedepot.com/");
12-
await stagehand.page.act("search for gas grills");
12+
await stagehand.page.act("enter 'gas grills' in the search bar");
13+
await stagehand.page.act("press enter");
1314
await stagehand.page.act("click on the best selling gas grill");
1415
await stagehand.page.act("click on the Product Details");
15-
await stagehand.page.act("find the Primary Burner BTU");
1616

1717
const productSpecs = await stagehand.page.extract({
1818
instruction: "Extract the Primary exact Burner BTU of the product",
1919
schema: z.object({
20-
productSpecs: z
21-
.array(
22-
z.object({
23-
burnerBTU: z.string().describe("Primary Burner BTU exact value"),
24-
}),
25-
)
26-
.describe("Gas grill Primary Burner BTU exact value"),
20+
productSpecs: z.object({
21+
burnerBTU: z.number().describe("Primary Burner BTU exact value"),
22+
}),
2723
}),
2824
});
2925

@@ -38,11 +34,7 @@ export const homedepot: EvalFunction = async ({
3834
},
3935
});
4036

41-
if (
42-
!productSpecs ||
43-
!productSpecs.productSpecs ||
44-
productSpecs.productSpecs.length !== 1
45-
) {
37+
if (!productSpecs || !productSpecs.productSpecs) {
4638
await stagehand.close();
4739

4840
return {
@@ -54,14 +46,12 @@ export const homedepot: EvalFunction = async ({
5446
};
5547
}
5648

57-
const hasFourZerosAndOne4 =
58-
(productSpecs.productSpecs[0].burnerBTU.match(/0/g) || []).length === 4 &&
59-
(productSpecs.productSpecs[0].burnerBTU.match(/4/g) || []).length === 1;
49+
const isLargerThan1000 = productSpecs.productSpecs.burnerBTU >= 10000;
6050

6151
await stagehand.close();
6252

6353
return {
64-
_success: hasFourZerosAndOne4,
54+
_success: isLargerThan1000,
6555
productSpecs,
6656
debugUrl,
6757
sessionUrl,

evals/tasks/os_dropdown.ts

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -21,9 +21,7 @@ export const os_dropdown: EvalFunction = async ({
2121
"choose 'Smog Check Technician' from the 'License Type' dropdown",
2222
);
2323
const selectedOption = await page
24-
.locator(
25-
"xpath=/html/body/form/div[1]/div[3]/article/div[2]/div[1]/select[2] >> option:checked",
26-
)
24+
.locator("#licenseType >> option:checked")
2725
.textContent();
2826

2927
if (selectedOption === "Smog Check Technician") {

evals/tasks/rakuten_jp.ts

Lines changed: 3 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -7,19 +7,12 @@ export const rakuten_jp: EvalFunction = async ({
77
logger,
88
}) => {
99
await stagehand.page.goto("https://www.rakuten.co.jp/");
10-
await stagehand.page.act({ action: "click on online supermarket" });
11-
12-
await stagehand.page.act({ action: "if there is a popup, close it" });
13-
14-
await stagehand.page.act({
15-
action: "navigate to Inageya Online Supermarket",
16-
});
17-
await stagehand.page.act({ action: "click the search bar input" });
18-
await stagehand.page.act({ action: "search for '香菜'" });
1910

11+
await stagehand.page.act({ action: "type '香菜' into the search bar" });
12+
await stagehand.page.act({ action: "press enter" });
2013
const url = stagehand.page.url();
2114
const successUrl =
22-
"https://netsuper.rakuten.co.jp/inageya/search/?keyword=%E9%A6%99%E8%8F%9C";
15+
"https://search.rakuten.co.jp/search/mall/%E9%A6%99%E8%8F%9C/";
2316

2417
await stagehand.close();
2518

evals/tasks/ted_talk.ts

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,13 @@ export const ted_talk: EvalFunction = async ({
1414
waitUntil: "domcontentloaded",
1515
},
1616
);
17+
18+
await stagehand.page.act({
19+
action: "scroll 10% down the page",
20+
});
21+
22+
await new Promise((resolve) => setTimeout(resolve, 5000));
23+
1724
await stagehand.page.act({
1825
action:
1926
"Click the link that takes you to the page about the 'Culture' topic",

0 commit comments

Comments
 (0)